{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 43800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00228310502283105, "grad_norm": 33.85030746459961, "learning_rate": 2.9965753424657534e-05, "loss": 4.7256, "step": 50 }, { "epoch": 0.0045662100456621, "grad_norm": 28.36233139038086, "learning_rate": 2.993150684931507e-05, "loss": 4.0093, "step": 100 }, { "epoch": 0.00684931506849315, "grad_norm": 40.79988098144531, "learning_rate": 2.9897260273972603e-05, "loss": 3.3789, "step": 150 }, { "epoch": 0.0091324200913242, "grad_norm": 38.37752914428711, "learning_rate": 2.9863013698630136e-05, "loss": 2.5531, "step": 200 }, { "epoch": 0.01141552511415525, "grad_norm": 46.2108268737793, "learning_rate": 2.9828767123287673e-05, "loss": 2.4412, "step": 250 }, { "epoch": 0.0136986301369863, "grad_norm": 22.16556739807129, "learning_rate": 2.9794520547945206e-05, "loss": 2.1968, "step": 300 }, { "epoch": 0.01598173515981735, "grad_norm": 38.14749526977539, "learning_rate": 2.9760273972602742e-05, "loss": 2.0913, "step": 350 }, { "epoch": 0.0182648401826484, "grad_norm": 38.938079833984375, "learning_rate": 2.9726027397260275e-05, "loss": 2.0564, "step": 400 }, { "epoch": 0.02054794520547945, "grad_norm": 51.680259704589844, "learning_rate": 2.969178082191781e-05, "loss": 1.9041, "step": 450 }, { "epoch": 0.0228310502283105, "grad_norm": 23.21047019958496, "learning_rate": 2.9657534246575345e-05, "loss": 1.7974, "step": 500 }, { "epoch": 0.02511415525114155, "grad_norm": 27.936336517333984, "learning_rate": 2.9623287671232878e-05, "loss": 1.6824, "step": 550 }, { "epoch": 0.0273972602739726, "grad_norm": 121.84676361083984, "learning_rate": 2.958904109589041e-05, "loss": 2.0853, "step": 600 }, { "epoch": 0.02968036529680365, "grad_norm": 51.72910690307617, "learning_rate": 2.9554794520547947e-05, "loss": 1.8569, "step": 650 }, { "epoch": 0.0319634703196347, "grad_norm": 27.697223663330078, "learning_rate": 2.952054794520548e-05, "loss": 1.7805, "step": 700 }, { "epoch": 0.03424657534246575, "grad_norm": 59.87869644165039, "learning_rate": 2.9486301369863017e-05, "loss": 1.8421, "step": 750 }, { "epoch": 0.0365296803652968, "grad_norm": 20.744386672973633, "learning_rate": 2.945205479452055e-05, "loss": 1.8149, "step": 800 }, { "epoch": 0.03881278538812785, "grad_norm": 22.272050857543945, "learning_rate": 2.9417808219178083e-05, "loss": 1.6892, "step": 850 }, { "epoch": 0.0410958904109589, "grad_norm": 34.34733963012695, "learning_rate": 2.938356164383562e-05, "loss": 1.6013, "step": 900 }, { "epoch": 0.04337899543378995, "grad_norm": 19.482553482055664, "learning_rate": 2.9349315068493152e-05, "loss": 1.6772, "step": 950 }, { "epoch": 0.045662100456621, "grad_norm": 19.61932945251465, "learning_rate": 2.9315068493150685e-05, "loss": 1.7372, "step": 1000 }, { "epoch": 0.04794520547945205, "grad_norm": 47.28583526611328, "learning_rate": 2.9280821917808222e-05, "loss": 1.6459, "step": 1050 }, { "epoch": 0.0502283105022831, "grad_norm": 74.12429809570312, "learning_rate": 2.9246575342465755e-05, "loss": 1.4242, "step": 1100 }, { "epoch": 0.05251141552511415, "grad_norm": 28.49104118347168, "learning_rate": 2.921232876712329e-05, "loss": 1.4958, "step": 1150 }, { "epoch": 0.0547945205479452, "grad_norm": 69.21525573730469, "learning_rate": 2.9178082191780824e-05, "loss": 1.4848, "step": 1200 }, { "epoch": 0.05707762557077625, "grad_norm": 28.827421188354492, "learning_rate": 2.9143835616438357e-05, "loss": 1.451, "step": 1250 }, { "epoch": 0.0593607305936073, "grad_norm": 69.60295867919922, "learning_rate": 2.910958904109589e-05, "loss": 1.6166, "step": 1300 }, { "epoch": 0.06164383561643835, "grad_norm": 43.529075622558594, "learning_rate": 2.9075342465753423e-05, "loss": 1.5593, "step": 1350 }, { "epoch": 0.0639269406392694, "grad_norm": 21.382102966308594, "learning_rate": 2.9041095890410956e-05, "loss": 1.5083, "step": 1400 }, { "epoch": 0.06621004566210045, "grad_norm": 38.85325622558594, "learning_rate": 2.9006849315068493e-05, "loss": 1.6731, "step": 1450 }, { "epoch": 0.0684931506849315, "grad_norm": 31.006072998046875, "learning_rate": 2.8972602739726026e-05, "loss": 1.4695, "step": 1500 }, { "epoch": 0.07077625570776255, "grad_norm": 26.75164222717285, "learning_rate": 2.8938356164383562e-05, "loss": 1.574, "step": 1550 }, { "epoch": 0.0730593607305936, "grad_norm": 38.76344299316406, "learning_rate": 2.8904109589041095e-05, "loss": 1.5674, "step": 1600 }, { "epoch": 0.07534246575342465, "grad_norm": 34.11776351928711, "learning_rate": 2.886986301369863e-05, "loss": 1.3804, "step": 1650 }, { "epoch": 0.0776255707762557, "grad_norm": 45.00776290893555, "learning_rate": 2.8835616438356165e-05, "loss": 1.5021, "step": 1700 }, { "epoch": 0.07990867579908675, "grad_norm": 85.93338775634766, "learning_rate": 2.8801369863013698e-05, "loss": 1.4673, "step": 1750 }, { "epoch": 0.0821917808219178, "grad_norm": 20.37348747253418, "learning_rate": 2.876712328767123e-05, "loss": 1.5315, "step": 1800 }, { "epoch": 0.08447488584474885, "grad_norm": 18.73834800720215, "learning_rate": 2.8732876712328767e-05, "loss": 1.4074, "step": 1850 }, { "epoch": 0.0867579908675799, "grad_norm": 38.19795608520508, "learning_rate": 2.86986301369863e-05, "loss": 1.4442, "step": 1900 }, { "epoch": 0.08904109589041095, "grad_norm": 18.31586265563965, "learning_rate": 2.8664383561643837e-05, "loss": 1.4918, "step": 1950 }, { "epoch": 0.091324200913242, "grad_norm": 44.28574752807617, "learning_rate": 2.863013698630137e-05, "loss": 1.3071, "step": 2000 }, { "epoch": 0.09360730593607305, "grad_norm": 51.20829391479492, "learning_rate": 2.8595890410958903e-05, "loss": 1.6107, "step": 2050 }, { "epoch": 0.0958904109589041, "grad_norm": 45.16651153564453, "learning_rate": 2.856164383561644e-05, "loss": 1.1947, "step": 2100 }, { "epoch": 0.09817351598173515, "grad_norm": 3.6301872730255127, "learning_rate": 2.8527397260273972e-05, "loss": 1.4133, "step": 2150 }, { "epoch": 0.1004566210045662, "grad_norm": 19.277009963989258, "learning_rate": 2.8493150684931505e-05, "loss": 1.2874, "step": 2200 }, { "epoch": 0.10273972602739725, "grad_norm": 36.330936431884766, "learning_rate": 2.8458904109589042e-05, "loss": 1.4955, "step": 2250 }, { "epoch": 0.1050228310502283, "grad_norm": 15.057435035705566, "learning_rate": 2.8424657534246575e-05, "loss": 1.3309, "step": 2300 }, { "epoch": 0.10730593607305935, "grad_norm": 31.65122413635254, "learning_rate": 2.839041095890411e-05, "loss": 1.3836, "step": 2350 }, { "epoch": 0.1095890410958904, "grad_norm": 26.276851654052734, "learning_rate": 2.8356164383561644e-05, "loss": 1.4872, "step": 2400 }, { "epoch": 0.11187214611872145, "grad_norm": 80.19303131103516, "learning_rate": 2.8321917808219177e-05, "loss": 1.4708, "step": 2450 }, { "epoch": 0.1141552511415525, "grad_norm": 23.674320220947266, "learning_rate": 2.8287671232876714e-05, "loss": 1.1475, "step": 2500 }, { "epoch": 0.11643835616438356, "grad_norm": 29.5810604095459, "learning_rate": 2.8253424657534247e-05, "loss": 1.5903, "step": 2550 }, { "epoch": 0.1187214611872146, "grad_norm": 32.49135208129883, "learning_rate": 2.821917808219178e-05, "loss": 1.3248, "step": 2600 }, { "epoch": 0.12100456621004566, "grad_norm": 33.26783752441406, "learning_rate": 2.8184931506849316e-05, "loss": 1.5587, "step": 2650 }, { "epoch": 0.1232876712328767, "grad_norm": 34.50761795043945, "learning_rate": 2.815068493150685e-05, "loss": 1.3713, "step": 2700 }, { "epoch": 0.12557077625570776, "grad_norm": 21.493181228637695, "learning_rate": 2.8116438356164386e-05, "loss": 1.2665, "step": 2750 }, { "epoch": 0.1278538812785388, "grad_norm": 10.42421817779541, "learning_rate": 2.808219178082192e-05, "loss": 1.3406, "step": 2800 }, { "epoch": 0.13013698630136986, "grad_norm": 18.451990127563477, "learning_rate": 2.8047945205479452e-05, "loss": 1.2502, "step": 2850 }, { "epoch": 0.1324200913242009, "grad_norm": 27.949146270751953, "learning_rate": 2.801369863013699e-05, "loss": 1.2492, "step": 2900 }, { "epoch": 0.13470319634703196, "grad_norm": 19.943056106567383, "learning_rate": 2.797945205479452e-05, "loss": 1.4071, "step": 2950 }, { "epoch": 0.136986301369863, "grad_norm": 30.464204788208008, "learning_rate": 2.7945205479452054e-05, "loss": 1.2135, "step": 3000 }, { "epoch": 0.13926940639269406, "grad_norm": 20.832355499267578, "learning_rate": 2.791095890410959e-05, "loss": 1.2872, "step": 3050 }, { "epoch": 0.1415525114155251, "grad_norm": 13.37592601776123, "learning_rate": 2.7876712328767124e-05, "loss": 1.1165, "step": 3100 }, { "epoch": 0.14383561643835616, "grad_norm": 36.62568664550781, "learning_rate": 2.784246575342466e-05, "loss": 1.2849, "step": 3150 }, { "epoch": 0.1461187214611872, "grad_norm": 11.884871482849121, "learning_rate": 2.7808219178082193e-05, "loss": 1.241, "step": 3200 }, { "epoch": 0.14840182648401826, "grad_norm": 22.552181243896484, "learning_rate": 2.7773972602739726e-05, "loss": 1.1695, "step": 3250 }, { "epoch": 0.1506849315068493, "grad_norm": 29.05255126953125, "learning_rate": 2.7739726027397263e-05, "loss": 1.3266, "step": 3300 }, { "epoch": 0.15296803652968036, "grad_norm": 27.039731979370117, "learning_rate": 2.7705479452054796e-05, "loss": 1.4528, "step": 3350 }, { "epoch": 0.1552511415525114, "grad_norm": 12.077860832214355, "learning_rate": 2.767123287671233e-05, "loss": 1.0587, "step": 3400 }, { "epoch": 0.15753424657534246, "grad_norm": 33.64236068725586, "learning_rate": 2.7636986301369865e-05, "loss": 1.3053, "step": 3450 }, { "epoch": 0.1598173515981735, "grad_norm": 7.781187534332275, "learning_rate": 2.76027397260274e-05, "loss": 1.5692, "step": 3500 }, { "epoch": 0.16210045662100456, "grad_norm": 58.28102111816406, "learning_rate": 2.7568493150684935e-05, "loss": 1.0579, "step": 3550 }, { "epoch": 0.1643835616438356, "grad_norm": 46.713260650634766, "learning_rate": 2.7534246575342468e-05, "loss": 1.1327, "step": 3600 }, { "epoch": 0.16666666666666666, "grad_norm": 38.657493591308594, "learning_rate": 2.75e-05, "loss": 1.299, "step": 3650 }, { "epoch": 0.1689497716894977, "grad_norm": 7.117087364196777, "learning_rate": 2.7465753424657537e-05, "loss": 1.0367, "step": 3700 }, { "epoch": 0.17123287671232876, "grad_norm": 33.10891342163086, "learning_rate": 2.743150684931507e-05, "loss": 1.1318, "step": 3750 }, { "epoch": 0.1735159817351598, "grad_norm": 38.68271255493164, "learning_rate": 2.7397260273972603e-05, "loss": 1.1906, "step": 3800 }, { "epoch": 0.17579908675799086, "grad_norm": 45.01628494262695, "learning_rate": 2.736301369863014e-05, "loss": 1.1285, "step": 3850 }, { "epoch": 0.1780821917808219, "grad_norm": 20.761018753051758, "learning_rate": 2.732876712328767e-05, "loss": 1.4117, "step": 3900 }, { "epoch": 0.18036529680365296, "grad_norm": 46.919952392578125, "learning_rate": 2.7294520547945206e-05, "loss": 1.0591, "step": 3950 }, { "epoch": 0.182648401826484, "grad_norm": 78.8492431640625, "learning_rate": 2.726027397260274e-05, "loss": 1.3741, "step": 4000 }, { "epoch": 0.18493150684931506, "grad_norm": 25.57372283935547, "learning_rate": 2.7226027397260272e-05, "loss": 1.1287, "step": 4050 }, { "epoch": 0.1872146118721461, "grad_norm": 16.47197723388672, "learning_rate": 2.719178082191781e-05, "loss": 1.3216, "step": 4100 }, { "epoch": 0.18949771689497716, "grad_norm": 36.24203109741211, "learning_rate": 2.715753424657534e-05, "loss": 1.3095, "step": 4150 }, { "epoch": 0.1917808219178082, "grad_norm": 8.002535820007324, "learning_rate": 2.7123287671232875e-05, "loss": 1.1426, "step": 4200 }, { "epoch": 0.19406392694063926, "grad_norm": 9.112885475158691, "learning_rate": 2.708904109589041e-05, "loss": 1.4292, "step": 4250 }, { "epoch": 0.1963470319634703, "grad_norm": 7.080036640167236, "learning_rate": 2.7054794520547944e-05, "loss": 1.2665, "step": 4300 }, { "epoch": 0.19863013698630136, "grad_norm": 8.403115272521973, "learning_rate": 2.702054794520548e-05, "loss": 1.3686, "step": 4350 }, { "epoch": 0.2009132420091324, "grad_norm": 29.843015670776367, "learning_rate": 2.6986301369863014e-05, "loss": 1.0842, "step": 4400 }, { "epoch": 0.20319634703196346, "grad_norm": 16.962310791015625, "learning_rate": 2.6952054794520547e-05, "loss": 1.1983, "step": 4450 }, { "epoch": 0.2054794520547945, "grad_norm": 19.98363494873047, "learning_rate": 2.6917808219178083e-05, "loss": 1.136, "step": 4500 }, { "epoch": 0.20776255707762556, "grad_norm": 23.507699966430664, "learning_rate": 2.6883561643835616e-05, "loss": 1.4167, "step": 4550 }, { "epoch": 0.2100456621004566, "grad_norm": 85.11144256591797, "learning_rate": 2.684931506849315e-05, "loss": 1.0667, "step": 4600 }, { "epoch": 0.21232876712328766, "grad_norm": 25.13855743408203, "learning_rate": 2.6815068493150686e-05, "loss": 1.1755, "step": 4650 }, { "epoch": 0.2146118721461187, "grad_norm": 21.816020965576172, "learning_rate": 2.678082191780822e-05, "loss": 1.1357, "step": 4700 }, { "epoch": 0.21689497716894976, "grad_norm": 27.655017852783203, "learning_rate": 2.6746575342465755e-05, "loss": 1.2578, "step": 4750 }, { "epoch": 0.2191780821917808, "grad_norm": 24.329933166503906, "learning_rate": 2.6712328767123288e-05, "loss": 1.1243, "step": 4800 }, { "epoch": 0.22146118721461186, "grad_norm": 9.320807456970215, "learning_rate": 2.667808219178082e-05, "loss": 1.2211, "step": 4850 }, { "epoch": 0.2237442922374429, "grad_norm": 14.245413780212402, "learning_rate": 2.6643835616438358e-05, "loss": 1.2804, "step": 4900 }, { "epoch": 0.22602739726027396, "grad_norm": 46.82653045654297, "learning_rate": 2.660958904109589e-05, "loss": 1.2949, "step": 4950 }, { "epoch": 0.228310502283105, "grad_norm": 43.06188201904297, "learning_rate": 2.6575342465753424e-05, "loss": 1.2012, "step": 5000 }, { "epoch": 0.23059360730593606, "grad_norm": 50.084381103515625, "learning_rate": 2.654109589041096e-05, "loss": 1.2965, "step": 5050 }, { "epoch": 0.2328767123287671, "grad_norm": 32.71940994262695, "learning_rate": 2.6506849315068493e-05, "loss": 1.0847, "step": 5100 }, { "epoch": 0.23515981735159816, "grad_norm": 53.56361389160156, "learning_rate": 2.647260273972603e-05, "loss": 1.2125, "step": 5150 }, { "epoch": 0.2374429223744292, "grad_norm": 57.06380081176758, "learning_rate": 2.6438356164383563e-05, "loss": 1.06, "step": 5200 }, { "epoch": 0.23972602739726026, "grad_norm": 41.62690734863281, "learning_rate": 2.6404109589041096e-05, "loss": 1.1048, "step": 5250 }, { "epoch": 0.2420091324200913, "grad_norm": 44.06789016723633, "learning_rate": 2.6369863013698632e-05, "loss": 1.2219, "step": 5300 }, { "epoch": 0.24429223744292236, "grad_norm": 14.321037292480469, "learning_rate": 2.6335616438356165e-05, "loss": 1.2137, "step": 5350 }, { "epoch": 0.2465753424657534, "grad_norm": 31.366552352905273, "learning_rate": 2.6301369863013698e-05, "loss": 1.2281, "step": 5400 }, { "epoch": 0.24885844748858446, "grad_norm": 19.642141342163086, "learning_rate": 2.6267123287671235e-05, "loss": 1.0646, "step": 5450 }, { "epoch": 0.2511415525114155, "grad_norm": 24.77635955810547, "learning_rate": 2.6232876712328768e-05, "loss": 1.0827, "step": 5500 }, { "epoch": 0.2534246575342466, "grad_norm": 21.257320404052734, "learning_rate": 2.6198630136986304e-05, "loss": 1.1798, "step": 5550 }, { "epoch": 0.2557077625570776, "grad_norm": 7.442146301269531, "learning_rate": 2.6164383561643837e-05, "loss": 1.2409, "step": 5600 }, { "epoch": 0.2579908675799087, "grad_norm": 40.682579040527344, "learning_rate": 2.613013698630137e-05, "loss": 1.2001, "step": 5650 }, { "epoch": 0.2602739726027397, "grad_norm": 22.984914779663086, "learning_rate": 2.6095890410958907e-05, "loss": 1.0922, "step": 5700 }, { "epoch": 0.2625570776255708, "grad_norm": 14.088912010192871, "learning_rate": 2.606164383561644e-05, "loss": 1.0361, "step": 5750 }, { "epoch": 0.2648401826484018, "grad_norm": 26.859743118286133, "learning_rate": 2.6027397260273973e-05, "loss": 1.3059, "step": 5800 }, { "epoch": 0.2671232876712329, "grad_norm": 27.274446487426758, "learning_rate": 2.599315068493151e-05, "loss": 1.2146, "step": 5850 }, { "epoch": 0.2694063926940639, "grad_norm": 15.877359390258789, "learning_rate": 2.5958904109589042e-05, "loss": 1.047, "step": 5900 }, { "epoch": 0.271689497716895, "grad_norm": 27.11214256286621, "learning_rate": 2.592465753424658e-05, "loss": 1.1303, "step": 5950 }, { "epoch": 0.273972602739726, "grad_norm": 25.463417053222656, "learning_rate": 2.589041095890411e-05, "loss": 0.9916, "step": 6000 }, { "epoch": 0.2762557077625571, "grad_norm": 5.419765472412109, "learning_rate": 2.5856164383561645e-05, "loss": 1.3228, "step": 6050 }, { "epoch": 0.2785388127853881, "grad_norm": 26.771987915039062, "learning_rate": 2.582191780821918e-05, "loss": 1.1163, "step": 6100 }, { "epoch": 0.2808219178082192, "grad_norm": 40.04314422607422, "learning_rate": 2.5787671232876714e-05, "loss": 1.2227, "step": 6150 }, { "epoch": 0.2831050228310502, "grad_norm": 31.262903213500977, "learning_rate": 2.5753424657534247e-05, "loss": 1.0956, "step": 6200 }, { "epoch": 0.2853881278538813, "grad_norm": 8.211708068847656, "learning_rate": 2.5719178082191784e-05, "loss": 0.8686, "step": 6250 }, { "epoch": 0.2876712328767123, "grad_norm": 7.24912166595459, "learning_rate": 2.5684931506849317e-05, "loss": 1.2297, "step": 6300 }, { "epoch": 0.2899543378995434, "grad_norm": 11.469908714294434, "learning_rate": 2.5650684931506853e-05, "loss": 1.1037, "step": 6350 }, { "epoch": 0.2922374429223744, "grad_norm": 11.30005168914795, "learning_rate": 2.5616438356164386e-05, "loss": 1.1043, "step": 6400 }, { "epoch": 0.2945205479452055, "grad_norm": 9.771041870117188, "learning_rate": 2.558219178082192e-05, "loss": 1.09, "step": 6450 }, { "epoch": 0.2968036529680365, "grad_norm": 22.025493621826172, "learning_rate": 2.5547945205479452e-05, "loss": 1.1252, "step": 6500 }, { "epoch": 0.2990867579908676, "grad_norm": 22.842763900756836, "learning_rate": 2.5513698630136985e-05, "loss": 1.0219, "step": 6550 }, { "epoch": 0.3013698630136986, "grad_norm": 49.29541015625, "learning_rate": 2.5479452054794518e-05, "loss": 0.8537, "step": 6600 }, { "epoch": 0.3036529680365297, "grad_norm": 32.22005844116211, "learning_rate": 2.5445205479452055e-05, "loss": 1.1211, "step": 6650 }, { "epoch": 0.3059360730593607, "grad_norm": 49.50102615356445, "learning_rate": 2.5410958904109588e-05, "loss": 1.0462, "step": 6700 }, { "epoch": 0.3082191780821918, "grad_norm": 69.23637390136719, "learning_rate": 2.5376712328767124e-05, "loss": 0.9964, "step": 6750 }, { "epoch": 0.3105022831050228, "grad_norm": 15.762770652770996, "learning_rate": 2.5342465753424657e-05, "loss": 1.3032, "step": 6800 }, { "epoch": 0.3127853881278539, "grad_norm": 36.22660827636719, "learning_rate": 2.530821917808219e-05, "loss": 1.1776, "step": 6850 }, { "epoch": 0.3150684931506849, "grad_norm": 6.89371395111084, "learning_rate": 2.5273972602739727e-05, "loss": 1.1224, "step": 6900 }, { "epoch": 0.317351598173516, "grad_norm": 76.80335235595703, "learning_rate": 2.523972602739726e-05, "loss": 0.9386, "step": 6950 }, { "epoch": 0.319634703196347, "grad_norm": 50.3594970703125, "learning_rate": 2.5205479452054793e-05, "loss": 1.0721, "step": 7000 }, { "epoch": 0.3219178082191781, "grad_norm": 26.805213928222656, "learning_rate": 2.517123287671233e-05, "loss": 1.2563, "step": 7050 }, { "epoch": 0.3242009132420091, "grad_norm": 39.83240509033203, "learning_rate": 2.5136986301369862e-05, "loss": 1.0924, "step": 7100 }, { "epoch": 0.3264840182648402, "grad_norm": 41.219818115234375, "learning_rate": 2.51027397260274e-05, "loss": 1.0771, "step": 7150 }, { "epoch": 0.3287671232876712, "grad_norm": 63.68681335449219, "learning_rate": 2.5068493150684932e-05, "loss": 1.1793, "step": 7200 }, { "epoch": 0.3310502283105023, "grad_norm": 96.54421997070312, "learning_rate": 2.5034246575342465e-05, "loss": 1.185, "step": 7250 }, { "epoch": 0.3333333333333333, "grad_norm": 35.43101119995117, "learning_rate": 2.5e-05, "loss": 1.3505, "step": 7300 }, { "epoch": 0.3356164383561644, "grad_norm": 26.726581573486328, "learning_rate": 2.4965753424657534e-05, "loss": 1.0832, "step": 7350 }, { "epoch": 0.3378995433789954, "grad_norm": 22.066526412963867, "learning_rate": 2.4931506849315067e-05, "loss": 1.2593, "step": 7400 }, { "epoch": 0.3401826484018265, "grad_norm": 9.076361656188965, "learning_rate": 2.4897260273972604e-05, "loss": 0.9935, "step": 7450 }, { "epoch": 0.3424657534246575, "grad_norm": 32.710853576660156, "learning_rate": 2.4863013698630137e-05, "loss": 1.2397, "step": 7500 }, { "epoch": 0.3447488584474886, "grad_norm": 24.153223037719727, "learning_rate": 2.4828767123287673e-05, "loss": 1.1828, "step": 7550 }, { "epoch": 0.3470319634703196, "grad_norm": 21.1640682220459, "learning_rate": 2.4794520547945206e-05, "loss": 1.2872, "step": 7600 }, { "epoch": 0.3493150684931507, "grad_norm": 8.183709144592285, "learning_rate": 2.476027397260274e-05, "loss": 1.0255, "step": 7650 }, { "epoch": 0.3515981735159817, "grad_norm": 32.05094528198242, "learning_rate": 2.4726027397260276e-05, "loss": 1.1408, "step": 7700 }, { "epoch": 0.3538812785388128, "grad_norm": 42.780330657958984, "learning_rate": 2.469178082191781e-05, "loss": 1.0889, "step": 7750 }, { "epoch": 0.3561643835616438, "grad_norm": 12.330619812011719, "learning_rate": 2.4657534246575342e-05, "loss": 1.1555, "step": 7800 }, { "epoch": 0.3584474885844749, "grad_norm": 52.48133850097656, "learning_rate": 2.4623287671232878e-05, "loss": 0.9871, "step": 7850 }, { "epoch": 0.3607305936073059, "grad_norm": 17.115123748779297, "learning_rate": 2.458904109589041e-05, "loss": 1.2171, "step": 7900 }, { "epoch": 0.363013698630137, "grad_norm": 24.72197914123535, "learning_rate": 2.4554794520547948e-05, "loss": 0.9396, "step": 7950 }, { "epoch": 0.365296803652968, "grad_norm": 59.79161834716797, "learning_rate": 2.452054794520548e-05, "loss": 1.161, "step": 8000 }, { "epoch": 0.3675799086757991, "grad_norm": 27.012365341186523, "learning_rate": 2.4486301369863014e-05, "loss": 1.2269, "step": 8050 }, { "epoch": 0.3698630136986301, "grad_norm": 10.282275199890137, "learning_rate": 2.445205479452055e-05, "loss": 1.0582, "step": 8100 }, { "epoch": 0.3721461187214612, "grad_norm": 16.697723388671875, "learning_rate": 2.4417808219178083e-05, "loss": 1.1504, "step": 8150 }, { "epoch": 0.3744292237442922, "grad_norm": 19.30514144897461, "learning_rate": 2.4383561643835616e-05, "loss": 1.0279, "step": 8200 }, { "epoch": 0.3767123287671233, "grad_norm": 49.45153045654297, "learning_rate": 2.4349315068493153e-05, "loss": 1.0799, "step": 8250 }, { "epoch": 0.3789954337899543, "grad_norm": 33.181060791015625, "learning_rate": 2.4315068493150686e-05, "loss": 1.1191, "step": 8300 }, { "epoch": 0.3812785388127854, "grad_norm": 79.8065185546875, "learning_rate": 2.4280821917808222e-05, "loss": 1.1442, "step": 8350 }, { "epoch": 0.3835616438356164, "grad_norm": 48.79990005493164, "learning_rate": 2.4246575342465755e-05, "loss": 1.349, "step": 8400 }, { "epoch": 0.3858447488584475, "grad_norm": 56.24677658081055, "learning_rate": 2.4212328767123288e-05, "loss": 1.0238, "step": 8450 }, { "epoch": 0.3881278538812785, "grad_norm": 12.791101455688477, "learning_rate": 2.4178082191780825e-05, "loss": 1.0045, "step": 8500 }, { "epoch": 0.3904109589041096, "grad_norm": 13.716742515563965, "learning_rate": 2.4143835616438358e-05, "loss": 1.0964, "step": 8550 }, { "epoch": 0.3926940639269406, "grad_norm": 6.694267272949219, "learning_rate": 2.410958904109589e-05, "loss": 0.9396, "step": 8600 }, { "epoch": 0.3949771689497717, "grad_norm": 19.237689971923828, "learning_rate": 2.4075342465753427e-05, "loss": 1.0931, "step": 8650 }, { "epoch": 0.3972602739726027, "grad_norm": 23.344566345214844, "learning_rate": 2.404109589041096e-05, "loss": 1.3482, "step": 8700 }, { "epoch": 0.3995433789954338, "grad_norm": 9.369404792785645, "learning_rate": 2.4006849315068497e-05, "loss": 1.0205, "step": 8750 }, { "epoch": 0.4018264840182648, "grad_norm": 20.9238224029541, "learning_rate": 2.397260273972603e-05, "loss": 1.0606, "step": 8800 }, { "epoch": 0.4041095890410959, "grad_norm": 66.00227355957031, "learning_rate": 2.3938356164383563e-05, "loss": 1.103, "step": 8850 }, { "epoch": 0.4063926940639269, "grad_norm": 5.158641815185547, "learning_rate": 2.39041095890411e-05, "loss": 1.0207, "step": 8900 }, { "epoch": 0.408675799086758, "grad_norm": 19.67837142944336, "learning_rate": 2.3869863013698632e-05, "loss": 1.108, "step": 8950 }, { "epoch": 0.410958904109589, "grad_norm": 48.31447219848633, "learning_rate": 2.3835616438356165e-05, "loss": 1.0708, "step": 9000 }, { "epoch": 0.4132420091324201, "grad_norm": 18.173908233642578, "learning_rate": 2.38013698630137e-05, "loss": 1.0449, "step": 9050 }, { "epoch": 0.4155251141552511, "grad_norm": 57.505226135253906, "learning_rate": 2.376712328767123e-05, "loss": 1.1911, "step": 9100 }, { "epoch": 0.4178082191780822, "grad_norm": 39.649169921875, "learning_rate": 2.3732876712328768e-05, "loss": 1.1708, "step": 9150 }, { "epoch": 0.4200913242009132, "grad_norm": 7.622274398803711, "learning_rate": 2.36986301369863e-05, "loss": 0.9928, "step": 9200 }, { "epoch": 0.4223744292237443, "grad_norm": 17.452634811401367, "learning_rate": 2.3664383561643834e-05, "loss": 0.9558, "step": 9250 }, { "epoch": 0.4246575342465753, "grad_norm": 59.85329055786133, "learning_rate": 2.363013698630137e-05, "loss": 1.0805, "step": 9300 }, { "epoch": 0.4269406392694064, "grad_norm": 9.634559631347656, "learning_rate": 2.3595890410958903e-05, "loss": 0.9829, "step": 9350 }, { "epoch": 0.4292237442922374, "grad_norm": 1.924180507659912, "learning_rate": 2.3561643835616436e-05, "loss": 1.0737, "step": 9400 }, { "epoch": 0.4315068493150685, "grad_norm": 31.276365280151367, "learning_rate": 2.3527397260273973e-05, "loss": 1.0905, "step": 9450 }, { "epoch": 0.4337899543378995, "grad_norm": 29.218658447265625, "learning_rate": 2.3493150684931506e-05, "loss": 1.2167, "step": 9500 }, { "epoch": 0.4360730593607306, "grad_norm": 42.0178108215332, "learning_rate": 2.3458904109589042e-05, "loss": 1.0009, "step": 9550 }, { "epoch": 0.4383561643835616, "grad_norm": 36.67307662963867, "learning_rate": 2.3424657534246575e-05, "loss": 1.1016, "step": 9600 }, { "epoch": 0.4406392694063927, "grad_norm": 21.634462356567383, "learning_rate": 2.339041095890411e-05, "loss": 1.0895, "step": 9650 }, { "epoch": 0.4429223744292237, "grad_norm": 20.32236671447754, "learning_rate": 2.3356164383561645e-05, "loss": 1.1135, "step": 9700 }, { "epoch": 0.4452054794520548, "grad_norm": 5.574302673339844, "learning_rate": 2.3321917808219178e-05, "loss": 1.1313, "step": 9750 }, { "epoch": 0.4474885844748858, "grad_norm": 5.582201957702637, "learning_rate": 2.328767123287671e-05, "loss": 1.0072, "step": 9800 }, { "epoch": 0.4497716894977169, "grad_norm": 58.499603271484375, "learning_rate": 2.3253424657534247e-05, "loss": 1.2044, "step": 9850 }, { "epoch": 0.4520547945205479, "grad_norm": 18.887069702148438, "learning_rate": 2.321917808219178e-05, "loss": 1.0175, "step": 9900 }, { "epoch": 0.454337899543379, "grad_norm": 33.383880615234375, "learning_rate": 2.3184931506849317e-05, "loss": 1.1733, "step": 9950 }, { "epoch": 0.45662100456621, "grad_norm": 85.34075927734375, "learning_rate": 2.315068493150685e-05, "loss": 1.1876, "step": 10000 }, { "epoch": 0.4589041095890411, "grad_norm": 22.957134246826172, "learning_rate": 2.3116438356164383e-05, "loss": 1.074, "step": 10050 }, { "epoch": 0.4611872146118721, "grad_norm": 57.47038269042969, "learning_rate": 2.308219178082192e-05, "loss": 0.9007, "step": 10100 }, { "epoch": 0.4634703196347032, "grad_norm": 11.614636421203613, "learning_rate": 2.3047945205479452e-05, "loss": 1.0877, "step": 10150 }, { "epoch": 0.4657534246575342, "grad_norm": 8.970778465270996, "learning_rate": 2.3013698630136985e-05, "loss": 1.3286, "step": 10200 }, { "epoch": 0.4680365296803653, "grad_norm": 43.425514221191406, "learning_rate": 2.2979452054794522e-05, "loss": 1.1883, "step": 10250 }, { "epoch": 0.4703196347031963, "grad_norm": 12.85531234741211, "learning_rate": 2.2945205479452055e-05, "loss": 1.0921, "step": 10300 }, { "epoch": 0.4726027397260274, "grad_norm": 6.603804111480713, "learning_rate": 2.291095890410959e-05, "loss": 1.1883, "step": 10350 }, { "epoch": 0.4748858447488584, "grad_norm": 47.8962516784668, "learning_rate": 2.2876712328767124e-05, "loss": 1.1814, "step": 10400 }, { "epoch": 0.4771689497716895, "grad_norm": 59.406280517578125, "learning_rate": 2.2842465753424657e-05, "loss": 1.115, "step": 10450 }, { "epoch": 0.4794520547945205, "grad_norm": 43.54423522949219, "learning_rate": 2.2808219178082194e-05, "loss": 0.998, "step": 10500 }, { "epoch": 0.4817351598173516, "grad_norm": 17.37261962890625, "learning_rate": 2.2773972602739727e-05, "loss": 0.9542, "step": 10550 }, { "epoch": 0.4840182648401826, "grad_norm": 38.04684829711914, "learning_rate": 2.273972602739726e-05, "loss": 1.0195, "step": 10600 }, { "epoch": 0.4863013698630137, "grad_norm": 17.202608108520508, "learning_rate": 2.2705479452054796e-05, "loss": 1.109, "step": 10650 }, { "epoch": 0.4885844748858447, "grad_norm": 4.1588897705078125, "learning_rate": 2.267123287671233e-05, "loss": 1.0125, "step": 10700 }, { "epoch": 0.4908675799086758, "grad_norm": 10.435503005981445, "learning_rate": 2.2636986301369866e-05, "loss": 1.0044, "step": 10750 }, { "epoch": 0.4931506849315068, "grad_norm": 62.7014045715332, "learning_rate": 2.26027397260274e-05, "loss": 1.0457, "step": 10800 }, { "epoch": 0.4954337899543379, "grad_norm": 19.47702407836914, "learning_rate": 2.2568493150684932e-05, "loss": 1.0468, "step": 10850 }, { "epoch": 0.4977168949771689, "grad_norm": 42.62397766113281, "learning_rate": 2.253424657534247e-05, "loss": 1.0374, "step": 10900 }, { "epoch": 0.5, "grad_norm": 52.27951431274414, "learning_rate": 2.25e-05, "loss": 1.0967, "step": 10950 }, { "epoch": 0.502283105022831, "grad_norm": 8.714099884033203, "learning_rate": 2.2465753424657534e-05, "loss": 1.2492, "step": 11000 }, { "epoch": 0.5045662100456622, "grad_norm": 10.670646667480469, "learning_rate": 2.243150684931507e-05, "loss": 1.23, "step": 11050 }, { "epoch": 0.5068493150684932, "grad_norm": 30.391632080078125, "learning_rate": 2.2397260273972604e-05, "loss": 1.3576, "step": 11100 }, { "epoch": 0.5091324200913242, "grad_norm": 36.597408294677734, "learning_rate": 2.2363013698630137e-05, "loss": 0.9834, "step": 11150 }, { "epoch": 0.5114155251141552, "grad_norm": 58.925941467285156, "learning_rate": 2.2328767123287673e-05, "loss": 1.139, "step": 11200 }, { "epoch": 0.5136986301369864, "grad_norm": 52.20880126953125, "learning_rate": 2.2294520547945206e-05, "loss": 1.1037, "step": 11250 }, { "epoch": 0.5159817351598174, "grad_norm": 104.13248443603516, "learning_rate": 2.2260273972602743e-05, "loss": 1.0124, "step": 11300 }, { "epoch": 0.5182648401826484, "grad_norm": 37.718406677246094, "learning_rate": 2.2226027397260276e-05, "loss": 1.1873, "step": 11350 }, { "epoch": 0.5205479452054794, "grad_norm": 31.402559280395508, "learning_rate": 2.219178082191781e-05, "loss": 1.1506, "step": 11400 }, { "epoch": 0.5228310502283106, "grad_norm": 7.707183361053467, "learning_rate": 2.2157534246575345e-05, "loss": 1.2423, "step": 11450 }, { "epoch": 0.5251141552511416, "grad_norm": 45.80585861206055, "learning_rate": 2.212328767123288e-05, "loss": 1.1457, "step": 11500 }, { "epoch": 0.5273972602739726, "grad_norm": 58.7805061340332, "learning_rate": 2.208904109589041e-05, "loss": 1.0302, "step": 11550 }, { "epoch": 0.5296803652968036, "grad_norm": 29.0611515045166, "learning_rate": 2.2054794520547945e-05, "loss": 1.1852, "step": 11600 }, { "epoch": 0.5319634703196348, "grad_norm": 17.17108917236328, "learning_rate": 2.2020547945205478e-05, "loss": 1.0164, "step": 11650 }, { "epoch": 0.5342465753424658, "grad_norm": 28.332836151123047, "learning_rate": 2.1986301369863014e-05, "loss": 0.968, "step": 11700 }, { "epoch": 0.5365296803652968, "grad_norm": 7.278358459472656, "learning_rate": 2.1952054794520547e-05, "loss": 1.2058, "step": 11750 }, { "epoch": 0.5388127853881278, "grad_norm": 15.584250450134277, "learning_rate": 2.191780821917808e-05, "loss": 1.0892, "step": 11800 }, { "epoch": 0.541095890410959, "grad_norm": 22.4052791595459, "learning_rate": 2.1883561643835617e-05, "loss": 0.9984, "step": 11850 }, { "epoch": 0.54337899543379, "grad_norm": 31.585800170898438, "learning_rate": 2.184931506849315e-05, "loss": 1.2338, "step": 11900 }, { "epoch": 0.545662100456621, "grad_norm": 19.224178314208984, "learning_rate": 2.1815068493150683e-05, "loss": 1.0994, "step": 11950 }, { "epoch": 0.547945205479452, "grad_norm": 15.44568157196045, "learning_rate": 2.178082191780822e-05, "loss": 1.0938, "step": 12000 }, { "epoch": 0.5502283105022832, "grad_norm": 9.826760292053223, "learning_rate": 2.1746575342465752e-05, "loss": 1.1607, "step": 12050 }, { "epoch": 0.5525114155251142, "grad_norm": 8.327254295349121, "learning_rate": 2.171232876712329e-05, "loss": 1.0973, "step": 12100 }, { "epoch": 0.5547945205479452, "grad_norm": 39.339962005615234, "learning_rate": 2.167808219178082e-05, "loss": 1.1066, "step": 12150 }, { "epoch": 0.5570776255707762, "grad_norm": 27.773502349853516, "learning_rate": 2.1643835616438355e-05, "loss": 0.9664, "step": 12200 }, { "epoch": 0.5593607305936074, "grad_norm": 34.17534637451172, "learning_rate": 2.160958904109589e-05, "loss": 1.2806, "step": 12250 }, { "epoch": 0.5616438356164384, "grad_norm": 14.751806259155273, "learning_rate": 2.1575342465753424e-05, "loss": 1.0973, "step": 12300 }, { "epoch": 0.5639269406392694, "grad_norm": 39.10814666748047, "learning_rate": 2.1541095890410957e-05, "loss": 1.0821, "step": 12350 }, { "epoch": 0.5662100456621004, "grad_norm": 14.468631744384766, "learning_rate": 2.1506849315068494e-05, "loss": 0.8218, "step": 12400 }, { "epoch": 0.5684931506849316, "grad_norm": 30.599267959594727, "learning_rate": 2.1472602739726027e-05, "loss": 1.0412, "step": 12450 }, { "epoch": 0.5707762557077626, "grad_norm": 15.382964134216309, "learning_rate": 2.1438356164383563e-05, "loss": 0.9351, "step": 12500 }, { "epoch": 0.5730593607305936, "grad_norm": 44.646480560302734, "learning_rate": 2.1404109589041096e-05, "loss": 1.1423, "step": 12550 }, { "epoch": 0.5753424657534246, "grad_norm": 40.618309020996094, "learning_rate": 2.136986301369863e-05, "loss": 1.0657, "step": 12600 }, { "epoch": 0.5776255707762558, "grad_norm": 50.3376579284668, "learning_rate": 2.1335616438356166e-05, "loss": 0.9775, "step": 12650 }, { "epoch": 0.5799086757990868, "grad_norm": 60.17688751220703, "learning_rate": 2.13013698630137e-05, "loss": 1.1291, "step": 12700 }, { "epoch": 0.5821917808219178, "grad_norm": 11.148223876953125, "learning_rate": 2.126712328767123e-05, "loss": 1.1556, "step": 12750 }, { "epoch": 0.5844748858447488, "grad_norm": 20.931018829345703, "learning_rate": 2.1232876712328768e-05, "loss": 1.0614, "step": 12800 }, { "epoch": 0.58675799086758, "grad_norm": 29.79435157775879, "learning_rate": 2.11986301369863e-05, "loss": 1.2341, "step": 12850 }, { "epoch": 0.589041095890411, "grad_norm": 27.72561264038086, "learning_rate": 2.1164383561643838e-05, "loss": 0.9783, "step": 12900 }, { "epoch": 0.591324200913242, "grad_norm": 6.564560890197754, "learning_rate": 2.113013698630137e-05, "loss": 1.0293, "step": 12950 }, { "epoch": 0.593607305936073, "grad_norm": 68.2934799194336, "learning_rate": 2.1095890410958904e-05, "loss": 1.1324, "step": 13000 }, { "epoch": 0.5958904109589042, "grad_norm": 8.194302558898926, "learning_rate": 2.106164383561644e-05, "loss": 0.9099, "step": 13050 }, { "epoch": 0.5981735159817352, "grad_norm": 25.813316345214844, "learning_rate": 2.1027397260273973e-05, "loss": 1.0368, "step": 13100 }, { "epoch": 0.6004566210045662, "grad_norm": 19.83176612854004, "learning_rate": 2.0993150684931506e-05, "loss": 0.9496, "step": 13150 }, { "epoch": 0.6027397260273972, "grad_norm": 17.05171012878418, "learning_rate": 2.0958904109589043e-05, "loss": 1.1386, "step": 13200 }, { "epoch": 0.6050228310502284, "grad_norm": 12.28995418548584, "learning_rate": 2.0924657534246576e-05, "loss": 1.1897, "step": 13250 }, { "epoch": 0.6073059360730594, "grad_norm": 22.48783302307129, "learning_rate": 2.0890410958904112e-05, "loss": 1.1413, "step": 13300 }, { "epoch": 0.6095890410958904, "grad_norm": 37.44598388671875, "learning_rate": 2.0856164383561645e-05, "loss": 1.1138, "step": 13350 }, { "epoch": 0.6118721461187214, "grad_norm": 19.184656143188477, "learning_rate": 2.0821917808219178e-05, "loss": 1.0924, "step": 13400 }, { "epoch": 0.6141552511415526, "grad_norm": 30.34543228149414, "learning_rate": 2.0787671232876715e-05, "loss": 1.0382, "step": 13450 }, { "epoch": 0.6164383561643836, "grad_norm": 18.820228576660156, "learning_rate": 2.0753424657534248e-05, "loss": 1.0618, "step": 13500 }, { "epoch": 0.6187214611872146, "grad_norm": 3.0790977478027344, "learning_rate": 2.071917808219178e-05, "loss": 1.0412, "step": 13550 }, { "epoch": 0.6210045662100456, "grad_norm": 42.972923278808594, "learning_rate": 2.0684931506849317e-05, "loss": 0.9701, "step": 13600 }, { "epoch": 0.6232876712328768, "grad_norm": 13.229798316955566, "learning_rate": 2.065068493150685e-05, "loss": 0.9746, "step": 13650 }, { "epoch": 0.6255707762557078, "grad_norm": 87.67366790771484, "learning_rate": 2.0616438356164387e-05, "loss": 1.1411, "step": 13700 }, { "epoch": 0.6278538812785388, "grad_norm": 12.436323165893555, "learning_rate": 2.058219178082192e-05, "loss": 1.208, "step": 13750 }, { "epoch": 0.6301369863013698, "grad_norm": 19.009395599365234, "learning_rate": 2.0547945205479453e-05, "loss": 1.0635, "step": 13800 }, { "epoch": 0.632420091324201, "grad_norm": 6.680107593536377, "learning_rate": 2.051369863013699e-05, "loss": 1.0463, "step": 13850 }, { "epoch": 0.634703196347032, "grad_norm": 11.398260116577148, "learning_rate": 2.0479452054794522e-05, "loss": 0.9179, "step": 13900 }, { "epoch": 0.636986301369863, "grad_norm": 4.983315467834473, "learning_rate": 2.0445205479452055e-05, "loss": 1.0699, "step": 13950 }, { "epoch": 0.639269406392694, "grad_norm": 29.882274627685547, "learning_rate": 2.041095890410959e-05, "loss": 1.1485, "step": 14000 }, { "epoch": 0.6415525114155252, "grad_norm": 9.104654312133789, "learning_rate": 2.0376712328767125e-05, "loss": 0.9783, "step": 14050 }, { "epoch": 0.6438356164383562, "grad_norm": 8.866716384887695, "learning_rate": 2.034246575342466e-05, "loss": 0.9843, "step": 14100 }, { "epoch": 0.6461187214611872, "grad_norm": 20.7504825592041, "learning_rate": 2.0308219178082194e-05, "loss": 0.9999, "step": 14150 }, { "epoch": 0.6484018264840182, "grad_norm": 16.239980697631836, "learning_rate": 2.0273972602739724e-05, "loss": 0.9305, "step": 14200 }, { "epoch": 0.6506849315068494, "grad_norm": 52.316673278808594, "learning_rate": 2.023972602739726e-05, "loss": 0.9185, "step": 14250 }, { "epoch": 0.6529680365296804, "grad_norm": 13.330124855041504, "learning_rate": 2.0205479452054793e-05, "loss": 0.9943, "step": 14300 }, { "epoch": 0.6552511415525114, "grad_norm": 13.177003860473633, "learning_rate": 2.0171232876712326e-05, "loss": 0.7542, "step": 14350 }, { "epoch": 0.6575342465753424, "grad_norm": 36.442108154296875, "learning_rate": 2.0136986301369863e-05, "loss": 0.8722, "step": 14400 }, { "epoch": 0.6598173515981736, "grad_norm": 26.251983642578125, "learning_rate": 2.0102739726027396e-05, "loss": 0.9657, "step": 14450 }, { "epoch": 0.6621004566210046, "grad_norm": 35.95085144042969, "learning_rate": 2.0068493150684932e-05, "loss": 1.0656, "step": 14500 }, { "epoch": 0.6643835616438356, "grad_norm": 29.169719696044922, "learning_rate": 2.0034246575342465e-05, "loss": 1.076, "step": 14550 }, { "epoch": 0.6666666666666666, "grad_norm": 49.64678955078125, "learning_rate": 1.9999999999999998e-05, "loss": 0.9267, "step": 14600 }, { "epoch": 0.6689497716894978, "grad_norm": 25.249004364013672, "learning_rate": 1.9965753424657535e-05, "loss": 1.1205, "step": 14650 }, { "epoch": 0.6712328767123288, "grad_norm": 21.96654510498047, "learning_rate": 1.9931506849315068e-05, "loss": 0.8928, "step": 14700 }, { "epoch": 0.6735159817351598, "grad_norm": 78.21800994873047, "learning_rate": 1.98972602739726e-05, "loss": 0.9646, "step": 14750 }, { "epoch": 0.6757990867579908, "grad_norm": 49.2692756652832, "learning_rate": 1.9863013698630137e-05, "loss": 1.0115, "step": 14800 }, { "epoch": 0.678082191780822, "grad_norm": 35.43125915527344, "learning_rate": 1.982876712328767e-05, "loss": 1.0473, "step": 14850 }, { "epoch": 0.680365296803653, "grad_norm": 18.292205810546875, "learning_rate": 1.9794520547945207e-05, "loss": 0.7609, "step": 14900 }, { "epoch": 0.682648401826484, "grad_norm": 18.42896842956543, "learning_rate": 1.976027397260274e-05, "loss": 0.989, "step": 14950 }, { "epoch": 0.684931506849315, "grad_norm": 25.584640502929688, "learning_rate": 1.9726027397260273e-05, "loss": 1.1391, "step": 15000 }, { "epoch": 0.6872146118721462, "grad_norm": 12.116101264953613, "learning_rate": 1.969178082191781e-05, "loss": 1.09, "step": 15050 }, { "epoch": 0.6894977168949772, "grad_norm": 4.520615100860596, "learning_rate": 1.9657534246575342e-05, "loss": 0.9834, "step": 15100 }, { "epoch": 0.6917808219178082, "grad_norm": 16.461069107055664, "learning_rate": 1.9623287671232875e-05, "loss": 0.8777, "step": 15150 }, { "epoch": 0.6940639269406392, "grad_norm": 41.71174240112305, "learning_rate": 1.9589041095890412e-05, "loss": 1.2619, "step": 15200 }, { "epoch": 0.6963470319634704, "grad_norm": 10.733747482299805, "learning_rate": 1.9554794520547945e-05, "loss": 1.0843, "step": 15250 }, { "epoch": 0.6986301369863014, "grad_norm": 34.77432632446289, "learning_rate": 1.952054794520548e-05, "loss": 0.9292, "step": 15300 }, { "epoch": 0.7009132420091324, "grad_norm": 46.95319747924805, "learning_rate": 1.9486301369863014e-05, "loss": 1.0058, "step": 15350 }, { "epoch": 0.7031963470319634, "grad_norm": 1.2955800294876099, "learning_rate": 1.9452054794520547e-05, "loss": 0.9672, "step": 15400 }, { "epoch": 0.7054794520547946, "grad_norm": 16.37827491760254, "learning_rate": 1.9417808219178084e-05, "loss": 0.9386, "step": 15450 }, { "epoch": 0.7077625570776256, "grad_norm": 41.87287139892578, "learning_rate": 1.9383561643835617e-05, "loss": 1.0772, "step": 15500 }, { "epoch": 0.7100456621004566, "grad_norm": 8.469085693359375, "learning_rate": 1.934931506849315e-05, "loss": 0.9201, "step": 15550 }, { "epoch": 0.7123287671232876, "grad_norm": 5.279160976409912, "learning_rate": 1.9315068493150686e-05, "loss": 1.2076, "step": 15600 }, { "epoch": 0.7146118721461188, "grad_norm": 24.052440643310547, "learning_rate": 1.928082191780822e-05, "loss": 1.079, "step": 15650 }, { "epoch": 0.7168949771689498, "grad_norm": 43.76991271972656, "learning_rate": 1.9246575342465756e-05, "loss": 0.9909, "step": 15700 }, { "epoch": 0.7191780821917808, "grad_norm": 0.4643230438232422, "learning_rate": 1.921232876712329e-05, "loss": 1.0314, "step": 15750 }, { "epoch": 0.7214611872146118, "grad_norm": 31.16683006286621, "learning_rate": 1.9178082191780822e-05, "loss": 1.0446, "step": 15800 }, { "epoch": 0.723744292237443, "grad_norm": 2.468339443206787, "learning_rate": 1.9143835616438358e-05, "loss": 0.9801, "step": 15850 }, { "epoch": 0.726027397260274, "grad_norm": 18.055511474609375, "learning_rate": 1.910958904109589e-05, "loss": 0.9679, "step": 15900 }, { "epoch": 0.728310502283105, "grad_norm": 11.843624114990234, "learning_rate": 1.9075342465753424e-05, "loss": 1.049, "step": 15950 }, { "epoch": 0.730593607305936, "grad_norm": 28.83637237548828, "learning_rate": 1.904109589041096e-05, "loss": 1.0862, "step": 16000 }, { "epoch": 0.7328767123287672, "grad_norm": 41.827579498291016, "learning_rate": 1.9006849315068494e-05, "loss": 0.9812, "step": 16050 }, { "epoch": 0.7351598173515982, "grad_norm": 19.110126495361328, "learning_rate": 1.897260273972603e-05, "loss": 1.2085, "step": 16100 }, { "epoch": 0.7374429223744292, "grad_norm": 7.440555572509766, "learning_rate": 1.8938356164383563e-05, "loss": 1.0045, "step": 16150 }, { "epoch": 0.7397260273972602, "grad_norm": 14.894591331481934, "learning_rate": 1.8904109589041096e-05, "loss": 1.0214, "step": 16200 }, { "epoch": 0.7420091324200914, "grad_norm": 50.265140533447266, "learning_rate": 1.8869863013698633e-05, "loss": 1.0115, "step": 16250 }, { "epoch": 0.7442922374429224, "grad_norm": 48.24840545654297, "learning_rate": 1.8835616438356166e-05, "loss": 1.022, "step": 16300 }, { "epoch": 0.7465753424657534, "grad_norm": 23.087678909301758, "learning_rate": 1.88013698630137e-05, "loss": 0.9043, "step": 16350 }, { "epoch": 0.7488584474885844, "grad_norm": 41.646461486816406, "learning_rate": 1.8767123287671235e-05, "loss": 1.0964, "step": 16400 }, { "epoch": 0.7511415525114156, "grad_norm": 18.607566833496094, "learning_rate": 1.8732876712328768e-05, "loss": 1.1307, "step": 16450 }, { "epoch": 0.7534246575342466, "grad_norm": 19.668004989624023, "learning_rate": 1.8698630136986305e-05, "loss": 0.9917, "step": 16500 }, { "epoch": 0.7557077625570776, "grad_norm": 81.84849548339844, "learning_rate": 1.8664383561643838e-05, "loss": 1.1245, "step": 16550 }, { "epoch": 0.7579908675799086, "grad_norm": 17.170801162719727, "learning_rate": 1.863013698630137e-05, "loss": 0.8943, "step": 16600 }, { "epoch": 0.7602739726027398, "grad_norm": 20.587200164794922, "learning_rate": 1.8595890410958907e-05, "loss": 0.9715, "step": 16650 }, { "epoch": 0.7625570776255708, "grad_norm": 65.86396026611328, "learning_rate": 1.856164383561644e-05, "loss": 1.0448, "step": 16700 }, { "epoch": 0.7648401826484018, "grad_norm": 16.04901695251465, "learning_rate": 1.8527397260273973e-05, "loss": 1.0682, "step": 16750 }, { "epoch": 0.7671232876712328, "grad_norm": 19.407167434692383, "learning_rate": 1.8493150684931506e-05, "loss": 0.9607, "step": 16800 }, { "epoch": 0.769406392694064, "grad_norm": 13.71429443359375, "learning_rate": 1.845890410958904e-05, "loss": 1.0604, "step": 16850 }, { "epoch": 0.771689497716895, "grad_norm": 51.34487533569336, "learning_rate": 1.8424657534246576e-05, "loss": 0.8779, "step": 16900 }, { "epoch": 0.773972602739726, "grad_norm": 13.411955833435059, "learning_rate": 1.839041095890411e-05, "loss": 1.0547, "step": 16950 }, { "epoch": 0.776255707762557, "grad_norm": 60.41900634765625, "learning_rate": 1.8356164383561642e-05, "loss": 1.0141, "step": 17000 }, { "epoch": 0.7785388127853882, "grad_norm": 65.25275421142578, "learning_rate": 1.832191780821918e-05, "loss": 1.0541, "step": 17050 }, { "epoch": 0.7808219178082192, "grad_norm": 64.50421905517578, "learning_rate": 1.828767123287671e-05, "loss": 1.0467, "step": 17100 }, { "epoch": 0.7831050228310502, "grad_norm": 35.63545608520508, "learning_rate": 1.8253424657534244e-05, "loss": 1.0431, "step": 17150 }, { "epoch": 0.7853881278538812, "grad_norm": 8.089126586914062, "learning_rate": 1.821917808219178e-05, "loss": 1.0258, "step": 17200 }, { "epoch": 0.7876712328767124, "grad_norm": 33.44828414916992, "learning_rate": 1.8184931506849314e-05, "loss": 0.9516, "step": 17250 }, { "epoch": 0.7899543378995434, "grad_norm": 52.08647537231445, "learning_rate": 1.815068493150685e-05, "loss": 1.0529, "step": 17300 }, { "epoch": 0.7922374429223744, "grad_norm": 33.359886169433594, "learning_rate": 1.8116438356164383e-05, "loss": 1.0985, "step": 17350 }, { "epoch": 0.7945205479452054, "grad_norm": 43.1573486328125, "learning_rate": 1.8082191780821916e-05, "loss": 0.987, "step": 17400 }, { "epoch": 0.7968036529680366, "grad_norm": 41.887718200683594, "learning_rate": 1.8047945205479453e-05, "loss": 0.9562, "step": 17450 }, { "epoch": 0.7990867579908676, "grad_norm": 4.935914993286133, "learning_rate": 1.8013698630136986e-05, "loss": 1.011, "step": 17500 }, { "epoch": 0.8013698630136986, "grad_norm": 17.437244415283203, "learning_rate": 1.797945205479452e-05, "loss": 1.1939, "step": 17550 }, { "epoch": 0.8036529680365296, "grad_norm": 38.83684158325195, "learning_rate": 1.7945205479452055e-05, "loss": 1.0831, "step": 17600 }, { "epoch": 0.8059360730593608, "grad_norm": 389.5946960449219, "learning_rate": 1.791095890410959e-05, "loss": 0.9442, "step": 17650 }, { "epoch": 0.8082191780821918, "grad_norm": 55.63192367553711, "learning_rate": 1.7876712328767125e-05, "loss": 0.8581, "step": 17700 }, { "epoch": 0.8105022831050228, "grad_norm": 12.910749435424805, "learning_rate": 1.7842465753424658e-05, "loss": 0.9023, "step": 17750 }, { "epoch": 0.8127853881278538, "grad_norm": 25.35328483581543, "learning_rate": 1.780821917808219e-05, "loss": 1.0642, "step": 17800 }, { "epoch": 0.815068493150685, "grad_norm": 14.35108470916748, "learning_rate": 1.7773972602739727e-05, "loss": 0.9517, "step": 17850 }, { "epoch": 0.817351598173516, "grad_norm": 16.878704071044922, "learning_rate": 1.773972602739726e-05, "loss": 1.0333, "step": 17900 }, { "epoch": 0.819634703196347, "grad_norm": 4.217007160186768, "learning_rate": 1.7705479452054793e-05, "loss": 1.048, "step": 17950 }, { "epoch": 0.821917808219178, "grad_norm": 7.6629767417907715, "learning_rate": 1.767123287671233e-05, "loss": 0.9847, "step": 18000 }, { "epoch": 0.8242009132420092, "grad_norm": 8.383101463317871, "learning_rate": 1.7636986301369863e-05, "loss": 0.8269, "step": 18050 }, { "epoch": 0.8264840182648402, "grad_norm": 21.882862091064453, "learning_rate": 1.76027397260274e-05, "loss": 1.0138, "step": 18100 }, { "epoch": 0.8287671232876712, "grad_norm": 15.08014965057373, "learning_rate": 1.7568493150684932e-05, "loss": 1.0072, "step": 18150 }, { "epoch": 0.8310502283105022, "grad_norm": 9.188797950744629, "learning_rate": 1.7534246575342465e-05, "loss": 0.998, "step": 18200 }, { "epoch": 0.8333333333333334, "grad_norm": 8.108163833618164, "learning_rate": 1.7500000000000002e-05, "loss": 0.8591, "step": 18250 }, { "epoch": 0.8356164383561644, "grad_norm": 8.082106590270996, "learning_rate": 1.7465753424657535e-05, "loss": 0.9486, "step": 18300 }, { "epoch": 0.8378995433789954, "grad_norm": 9.28232479095459, "learning_rate": 1.7431506849315068e-05, "loss": 1.1733, "step": 18350 }, { "epoch": 0.8401826484018264, "grad_norm": 42.009498596191406, "learning_rate": 1.7397260273972604e-05, "loss": 1.0863, "step": 18400 }, { "epoch": 0.8424657534246576, "grad_norm": 16.338623046875, "learning_rate": 1.7363013698630137e-05, "loss": 0.8428, "step": 18450 }, { "epoch": 0.8447488584474886, "grad_norm": 22.031343460083008, "learning_rate": 1.7328767123287674e-05, "loss": 0.9678, "step": 18500 }, { "epoch": 0.8470319634703196, "grad_norm": 60.101600646972656, "learning_rate": 1.7294520547945207e-05, "loss": 1.0961, "step": 18550 }, { "epoch": 0.8493150684931506, "grad_norm": 17.335065841674805, "learning_rate": 1.726027397260274e-05, "loss": 0.9917, "step": 18600 }, { "epoch": 0.8515981735159818, "grad_norm": 1.8705586194992065, "learning_rate": 1.7226027397260276e-05, "loss": 0.9031, "step": 18650 }, { "epoch": 0.8538812785388128, "grad_norm": 8.452926635742188, "learning_rate": 1.719178082191781e-05, "loss": 0.9161, "step": 18700 }, { "epoch": 0.8561643835616438, "grad_norm": 22.091020584106445, "learning_rate": 1.7157534246575342e-05, "loss": 0.9697, "step": 18750 }, { "epoch": 0.8584474885844748, "grad_norm": 14.892813682556152, "learning_rate": 1.712328767123288e-05, "loss": 1.1043, "step": 18800 }, { "epoch": 0.860730593607306, "grad_norm": 5.6726274490356445, "learning_rate": 1.7089041095890412e-05, "loss": 0.9272, "step": 18850 }, { "epoch": 0.863013698630137, "grad_norm": 4.598392963409424, "learning_rate": 1.705479452054795e-05, "loss": 1.0438, "step": 18900 }, { "epoch": 0.865296803652968, "grad_norm": 10.54848575592041, "learning_rate": 1.702054794520548e-05, "loss": 1.068, "step": 18950 }, { "epoch": 0.867579908675799, "grad_norm": 43.7364616394043, "learning_rate": 1.6986301369863014e-05, "loss": 0.9575, "step": 19000 }, { "epoch": 0.8698630136986302, "grad_norm": 17.5739803314209, "learning_rate": 1.695205479452055e-05, "loss": 0.933, "step": 19050 }, { "epoch": 0.8721461187214612, "grad_norm": 2.4345619678497314, "learning_rate": 1.6917808219178084e-05, "loss": 0.8702, "step": 19100 }, { "epoch": 0.8744292237442922, "grad_norm": 27.58513069152832, "learning_rate": 1.6883561643835617e-05, "loss": 0.9842, "step": 19150 }, { "epoch": 0.8767123287671232, "grad_norm": 44.58627700805664, "learning_rate": 1.6849315068493153e-05, "loss": 0.9117, "step": 19200 }, { "epoch": 0.8789954337899544, "grad_norm": 23.16618537902832, "learning_rate": 1.6815068493150686e-05, "loss": 0.9699, "step": 19250 }, { "epoch": 0.8812785388127854, "grad_norm": 10.979578971862793, "learning_rate": 1.6780821917808223e-05, "loss": 0.9397, "step": 19300 }, { "epoch": 0.8835616438356164, "grad_norm": 12.799473762512207, "learning_rate": 1.6746575342465753e-05, "loss": 1.0797, "step": 19350 }, { "epoch": 0.8858447488584474, "grad_norm": 6.065539836883545, "learning_rate": 1.6712328767123286e-05, "loss": 1.1177, "step": 19400 }, { "epoch": 0.8881278538812786, "grad_norm": 14.423439979553223, "learning_rate": 1.6678082191780822e-05, "loss": 0.8818, "step": 19450 }, { "epoch": 0.8904109589041096, "grad_norm": 1.1675946712493896, "learning_rate": 1.6643835616438355e-05, "loss": 0.8819, "step": 19500 }, { "epoch": 0.8926940639269406, "grad_norm": 18.09140396118164, "learning_rate": 1.6609589041095888e-05, "loss": 0.9183, "step": 19550 }, { "epoch": 0.8949771689497716, "grad_norm": 11.98416805267334, "learning_rate": 1.6575342465753425e-05, "loss": 1.0715, "step": 19600 }, { "epoch": 0.8972602739726028, "grad_norm": 16.66376495361328, "learning_rate": 1.6541095890410958e-05, "loss": 0.7896, "step": 19650 }, { "epoch": 0.8995433789954338, "grad_norm": 17.945493698120117, "learning_rate": 1.6506849315068494e-05, "loss": 0.8961, "step": 19700 }, { "epoch": 0.9018264840182648, "grad_norm": 34.556827545166016, "learning_rate": 1.6472602739726027e-05, "loss": 0.9543, "step": 19750 }, { "epoch": 0.9041095890410958, "grad_norm": 16.06093406677246, "learning_rate": 1.643835616438356e-05, "loss": 0.913, "step": 19800 }, { "epoch": 0.906392694063927, "grad_norm": 28.835208892822266, "learning_rate": 1.6404109589041096e-05, "loss": 1.068, "step": 19850 }, { "epoch": 0.908675799086758, "grad_norm": 3.6414895057678223, "learning_rate": 1.636986301369863e-05, "loss": 0.9695, "step": 19900 }, { "epoch": 0.910958904109589, "grad_norm": 16.142841339111328, "learning_rate": 1.6335616438356163e-05, "loss": 1.0057, "step": 19950 }, { "epoch": 0.91324200913242, "grad_norm": 15.585680961608887, "learning_rate": 1.63013698630137e-05, "loss": 0.964, "step": 20000 }, { "epoch": 0.9155251141552512, "grad_norm": 23.79058265686035, "learning_rate": 1.6267123287671232e-05, "loss": 0.8243, "step": 20050 }, { "epoch": 0.9178082191780822, "grad_norm": 14.258800506591797, "learning_rate": 1.623287671232877e-05, "loss": 0.9247, "step": 20100 }, { "epoch": 0.9200913242009132, "grad_norm": 16.076623916625977, "learning_rate": 1.61986301369863e-05, "loss": 0.8446, "step": 20150 }, { "epoch": 0.9223744292237442, "grad_norm": 19.330589294433594, "learning_rate": 1.6164383561643835e-05, "loss": 1.0295, "step": 20200 }, { "epoch": 0.9246575342465754, "grad_norm": 29.074434280395508, "learning_rate": 1.613013698630137e-05, "loss": 1.0043, "step": 20250 }, { "epoch": 0.9269406392694064, "grad_norm": 19.37755012512207, "learning_rate": 1.6095890410958904e-05, "loss": 0.8284, "step": 20300 }, { "epoch": 0.9292237442922374, "grad_norm": 17.540924072265625, "learning_rate": 1.6061643835616437e-05, "loss": 0.9585, "step": 20350 }, { "epoch": 0.9315068493150684, "grad_norm": 15.943336486816406, "learning_rate": 1.6027397260273974e-05, "loss": 0.9534, "step": 20400 }, { "epoch": 0.9337899543378996, "grad_norm": 14.471035957336426, "learning_rate": 1.5993150684931507e-05, "loss": 0.9346, "step": 20450 }, { "epoch": 0.9360730593607306, "grad_norm": 2.5866785049438477, "learning_rate": 1.5958904109589043e-05, "loss": 0.874, "step": 20500 }, { "epoch": 0.9383561643835616, "grad_norm": 23.115089416503906, "learning_rate": 1.5924657534246576e-05, "loss": 1.0657, "step": 20550 }, { "epoch": 0.9406392694063926, "grad_norm": 37.99090576171875, "learning_rate": 1.589041095890411e-05, "loss": 0.8484, "step": 20600 }, { "epoch": 0.9429223744292238, "grad_norm": 8.818378448486328, "learning_rate": 1.5856164383561646e-05, "loss": 0.8682, "step": 20650 }, { "epoch": 0.9452054794520548, "grad_norm": 64.82261657714844, "learning_rate": 1.582191780821918e-05, "loss": 0.9238, "step": 20700 }, { "epoch": 0.9474885844748858, "grad_norm": 4.82705545425415, "learning_rate": 1.578767123287671e-05, "loss": 1.0183, "step": 20750 }, { "epoch": 0.9497716894977168, "grad_norm": 35.37221908569336, "learning_rate": 1.5753424657534248e-05, "loss": 0.9661, "step": 20800 }, { "epoch": 0.952054794520548, "grad_norm": 59.23238754272461, "learning_rate": 1.571917808219178e-05, "loss": 1.1756, "step": 20850 }, { "epoch": 0.954337899543379, "grad_norm": 5.600244045257568, "learning_rate": 1.5684931506849318e-05, "loss": 0.9528, "step": 20900 }, { "epoch": 0.95662100456621, "grad_norm": 36.54751968383789, "learning_rate": 1.565068493150685e-05, "loss": 0.8113, "step": 20950 }, { "epoch": 0.958904109589041, "grad_norm": 21.206743240356445, "learning_rate": 1.5616438356164384e-05, "loss": 1.0184, "step": 21000 }, { "epoch": 0.9611872146118722, "grad_norm": 5.483078479766846, "learning_rate": 1.558219178082192e-05, "loss": 1.0164, "step": 21050 }, { "epoch": 0.9634703196347032, "grad_norm": 10.294635772705078, "learning_rate": 1.5547945205479453e-05, "loss": 0.8765, "step": 21100 }, { "epoch": 0.9657534246575342, "grad_norm": 15.811431884765625, "learning_rate": 1.5513698630136986e-05, "loss": 1.0893, "step": 21150 }, { "epoch": 0.9680365296803652, "grad_norm": 12.22139835357666, "learning_rate": 1.5479452054794523e-05, "loss": 0.8607, "step": 21200 }, { "epoch": 0.9703196347031964, "grad_norm": 32.24748229980469, "learning_rate": 1.5445205479452056e-05, "loss": 0.9461, "step": 21250 }, { "epoch": 0.9726027397260274, "grad_norm": 105.09614562988281, "learning_rate": 1.5410958904109592e-05, "loss": 1.0007, "step": 21300 }, { "epoch": 0.9748858447488584, "grad_norm": 39.1617546081543, "learning_rate": 1.5376712328767125e-05, "loss": 0.8074, "step": 21350 }, { "epoch": 0.9771689497716894, "grad_norm": 54.6515007019043, "learning_rate": 1.5342465753424658e-05, "loss": 1.0728, "step": 21400 }, { "epoch": 0.9794520547945206, "grad_norm": 11.429344177246094, "learning_rate": 1.5308219178082195e-05, "loss": 1.0846, "step": 21450 }, { "epoch": 0.9817351598173516, "grad_norm": 35.74282455444336, "learning_rate": 1.5273972602739728e-05, "loss": 0.9805, "step": 21500 }, { "epoch": 0.9840182648401826, "grad_norm": 28.15215301513672, "learning_rate": 1.5239726027397259e-05, "loss": 1.057, "step": 21550 }, { "epoch": 0.9863013698630136, "grad_norm": 24.915042877197266, "learning_rate": 1.5205479452054795e-05, "loss": 0.9953, "step": 21600 }, { "epoch": 0.9885844748858448, "grad_norm": 54.70051574707031, "learning_rate": 1.5171232876712328e-05, "loss": 0.8781, "step": 21650 }, { "epoch": 0.9908675799086758, "grad_norm": 57.645572662353516, "learning_rate": 1.5136986301369865e-05, "loss": 0.9031, "step": 21700 }, { "epoch": 0.9931506849315068, "grad_norm": 49.95241165161133, "learning_rate": 1.5102739726027398e-05, "loss": 1.1313, "step": 21750 }, { "epoch": 0.9954337899543378, "grad_norm": 11.169427871704102, "learning_rate": 1.5068493150684931e-05, "loss": 0.7831, "step": 21800 }, { "epoch": 0.997716894977169, "grad_norm": 1.9000864028930664, "learning_rate": 1.5034246575342467e-05, "loss": 0.9135, "step": 21850 }, { "epoch": 1.0, "grad_norm": 8.12338638305664, "learning_rate": 1.5e-05, "loss": 0.8517, "step": 21900 }, { "epoch": 1.0022831050228311, "grad_norm": 9.123151779174805, "learning_rate": 1.4965753424657535e-05, "loss": 0.8109, "step": 21950 }, { "epoch": 1.004566210045662, "grad_norm": 1.5693093538284302, "learning_rate": 1.4931506849315068e-05, "loss": 0.637, "step": 22000 }, { "epoch": 1.0068493150684932, "grad_norm": 57.09550094604492, "learning_rate": 1.4897260273972603e-05, "loss": 0.7138, "step": 22050 }, { "epoch": 1.009132420091324, "grad_norm": 22.930618286132812, "learning_rate": 1.4863013698630138e-05, "loss": 0.5267, "step": 22100 }, { "epoch": 1.0114155251141552, "grad_norm": 6.480524063110352, "learning_rate": 1.4828767123287672e-05, "loss": 0.583, "step": 22150 }, { "epoch": 1.0136986301369864, "grad_norm": 8.956890106201172, "learning_rate": 1.4794520547945205e-05, "loss": 0.838, "step": 22200 }, { "epoch": 1.0159817351598173, "grad_norm": 28.80997657775879, "learning_rate": 1.476027397260274e-05, "loss": 0.7791, "step": 22250 }, { "epoch": 1.0182648401826484, "grad_norm": 51.440643310546875, "learning_rate": 1.4726027397260275e-05, "loss": 0.6699, "step": 22300 }, { "epoch": 1.0205479452054795, "grad_norm": 16.62401580810547, "learning_rate": 1.469178082191781e-05, "loss": 0.6056, "step": 22350 }, { "epoch": 1.0228310502283104, "grad_norm": 6.103922367095947, "learning_rate": 1.4657534246575343e-05, "loss": 0.7163, "step": 22400 }, { "epoch": 1.0251141552511416, "grad_norm": 13.347216606140137, "learning_rate": 1.4623287671232877e-05, "loss": 0.8158, "step": 22450 }, { "epoch": 1.0273972602739727, "grad_norm": 10.218559265136719, "learning_rate": 1.4589041095890412e-05, "loss": 0.7067, "step": 22500 }, { "epoch": 1.0296803652968036, "grad_norm": 20.360658645629883, "learning_rate": 1.4554794520547945e-05, "loss": 0.5413, "step": 22550 }, { "epoch": 1.0319634703196348, "grad_norm": 14.813282012939453, "learning_rate": 1.4520547945205478e-05, "loss": 0.7121, "step": 22600 }, { "epoch": 1.0342465753424657, "grad_norm": 4.041282653808594, "learning_rate": 1.4486301369863013e-05, "loss": 0.6259, "step": 22650 }, { "epoch": 1.0365296803652968, "grad_norm": 8.718100547790527, "learning_rate": 1.4452054794520548e-05, "loss": 0.679, "step": 22700 }, { "epoch": 1.038812785388128, "grad_norm": 24.674726486206055, "learning_rate": 1.4417808219178082e-05, "loss": 0.5784, "step": 22750 }, { "epoch": 1.0410958904109588, "grad_norm": 18.543848037719727, "learning_rate": 1.4383561643835615e-05, "loss": 0.6259, "step": 22800 }, { "epoch": 1.04337899543379, "grad_norm": 11.60615348815918, "learning_rate": 1.434931506849315e-05, "loss": 0.6859, "step": 22850 }, { "epoch": 1.045662100456621, "grad_norm": 19.409120559692383, "learning_rate": 1.4315068493150685e-05, "loss": 0.724, "step": 22900 }, { "epoch": 1.047945205479452, "grad_norm": 18.087451934814453, "learning_rate": 1.428082191780822e-05, "loss": 0.6719, "step": 22950 }, { "epoch": 1.0502283105022832, "grad_norm": 37.11878967285156, "learning_rate": 1.4246575342465753e-05, "loss": 0.5994, "step": 23000 }, { "epoch": 1.052511415525114, "grad_norm": 35.999149322509766, "learning_rate": 1.4212328767123287e-05, "loss": 0.8059, "step": 23050 }, { "epoch": 1.0547945205479452, "grad_norm": 23.739049911499023, "learning_rate": 1.4178082191780822e-05, "loss": 0.4798, "step": 23100 }, { "epoch": 1.0570776255707763, "grad_norm": 4.680600166320801, "learning_rate": 1.4143835616438357e-05, "loss": 0.6557, "step": 23150 }, { "epoch": 1.0593607305936072, "grad_norm": 31.171585083007812, "learning_rate": 1.410958904109589e-05, "loss": 0.6122, "step": 23200 }, { "epoch": 1.0616438356164384, "grad_norm": 10.616646766662598, "learning_rate": 1.4075342465753425e-05, "loss": 0.7256, "step": 23250 }, { "epoch": 1.0639269406392695, "grad_norm": 5.910991191864014, "learning_rate": 1.404109589041096e-05, "loss": 0.5245, "step": 23300 }, { "epoch": 1.0662100456621004, "grad_norm": 6.659880638122559, "learning_rate": 1.4006849315068494e-05, "loss": 0.6245, "step": 23350 }, { "epoch": 1.0684931506849316, "grad_norm": 28.779626846313477, "learning_rate": 1.3972602739726027e-05, "loss": 0.7664, "step": 23400 }, { "epoch": 1.0707762557077625, "grad_norm": 42.86344909667969, "learning_rate": 1.3938356164383562e-05, "loss": 0.7317, "step": 23450 }, { "epoch": 1.0730593607305936, "grad_norm": 17.784263610839844, "learning_rate": 1.3904109589041097e-05, "loss": 0.7032, "step": 23500 }, { "epoch": 1.0753424657534247, "grad_norm": 33.24246597290039, "learning_rate": 1.3869863013698631e-05, "loss": 0.6477, "step": 23550 }, { "epoch": 1.0776255707762556, "grad_norm": 23.703388214111328, "learning_rate": 1.3835616438356164e-05, "loss": 0.5103, "step": 23600 }, { "epoch": 1.0799086757990868, "grad_norm": 27.350101470947266, "learning_rate": 1.38013698630137e-05, "loss": 0.7417, "step": 23650 }, { "epoch": 1.0821917808219177, "grad_norm": 32.2100715637207, "learning_rate": 1.3767123287671234e-05, "loss": 0.9106, "step": 23700 }, { "epoch": 1.0844748858447488, "grad_norm": 8.949542999267578, "learning_rate": 1.3732876712328769e-05, "loss": 0.6949, "step": 23750 }, { "epoch": 1.08675799086758, "grad_norm": 31.792240142822266, "learning_rate": 1.3698630136986302e-05, "loss": 0.725, "step": 23800 }, { "epoch": 1.0890410958904109, "grad_norm": 51.95265579223633, "learning_rate": 1.3664383561643835e-05, "loss": 0.73, "step": 23850 }, { "epoch": 1.091324200913242, "grad_norm": 19.113056182861328, "learning_rate": 1.363013698630137e-05, "loss": 0.6004, "step": 23900 }, { "epoch": 1.0936073059360731, "grad_norm": 16.85398292541504, "learning_rate": 1.3595890410958904e-05, "loss": 0.5843, "step": 23950 }, { "epoch": 1.095890410958904, "grad_norm": 20.591157913208008, "learning_rate": 1.3561643835616437e-05, "loss": 0.6938, "step": 24000 }, { "epoch": 1.0981735159817352, "grad_norm": 22.592805862426758, "learning_rate": 1.3527397260273972e-05, "loss": 0.6366, "step": 24050 }, { "epoch": 1.1004566210045663, "grad_norm": 77.25360870361328, "learning_rate": 1.3493150684931507e-05, "loss": 0.7245, "step": 24100 }, { "epoch": 1.1027397260273972, "grad_norm": 35.447452545166016, "learning_rate": 1.3458904109589042e-05, "loss": 0.7001, "step": 24150 }, { "epoch": 1.1050228310502284, "grad_norm": 50.135650634765625, "learning_rate": 1.3424657534246575e-05, "loss": 0.6473, "step": 24200 }, { "epoch": 1.1073059360730593, "grad_norm": 9.847333908081055, "learning_rate": 1.339041095890411e-05, "loss": 0.6206, "step": 24250 }, { "epoch": 1.1095890410958904, "grad_norm": 16.841964721679688, "learning_rate": 1.3356164383561644e-05, "loss": 0.754, "step": 24300 }, { "epoch": 1.1118721461187215, "grad_norm": 25.09712028503418, "learning_rate": 1.3321917808219179e-05, "loss": 0.7678, "step": 24350 }, { "epoch": 1.1141552511415524, "grad_norm": 20.83018684387207, "learning_rate": 1.3287671232876712e-05, "loss": 0.7375, "step": 24400 }, { "epoch": 1.1164383561643836, "grad_norm": 1.3051903247833252, "learning_rate": 1.3253424657534247e-05, "loss": 0.73, "step": 24450 }, { "epoch": 1.1187214611872145, "grad_norm": 21.558069229125977, "learning_rate": 1.3219178082191781e-05, "loss": 0.6535, "step": 24500 }, { "epoch": 1.1210045662100456, "grad_norm": 53.48995590209961, "learning_rate": 1.3184931506849316e-05, "loss": 0.7592, "step": 24550 }, { "epoch": 1.1232876712328768, "grad_norm": 6.8114166259765625, "learning_rate": 1.3150684931506849e-05, "loss": 0.6735, "step": 24600 }, { "epoch": 1.1255707762557077, "grad_norm": 16.559179306030273, "learning_rate": 1.3116438356164384e-05, "loss": 0.4556, "step": 24650 }, { "epoch": 1.1278538812785388, "grad_norm": 2.124957323074341, "learning_rate": 1.3082191780821919e-05, "loss": 0.92, "step": 24700 }, { "epoch": 1.13013698630137, "grad_norm": 34.67999267578125, "learning_rate": 1.3047945205479453e-05, "loss": 0.6309, "step": 24750 }, { "epoch": 1.1324200913242009, "grad_norm": 9.184309005737305, "learning_rate": 1.3013698630136986e-05, "loss": 0.7584, "step": 24800 }, { "epoch": 1.134703196347032, "grad_norm": 17.552547454833984, "learning_rate": 1.2979452054794521e-05, "loss": 0.6098, "step": 24850 }, { "epoch": 1.1369863013698631, "grad_norm": 37.24542999267578, "learning_rate": 1.2945205479452056e-05, "loss": 0.8336, "step": 24900 }, { "epoch": 1.139269406392694, "grad_norm": 52.120880126953125, "learning_rate": 1.291095890410959e-05, "loss": 0.6372, "step": 24950 }, { "epoch": 1.1415525114155252, "grad_norm": 13.773005485534668, "learning_rate": 1.2876712328767124e-05, "loss": 0.6734, "step": 25000 }, { "epoch": 1.143835616438356, "grad_norm": 12.726426124572754, "learning_rate": 1.2842465753424658e-05, "loss": 0.5249, "step": 25050 }, { "epoch": 1.1461187214611872, "grad_norm": 8.800257682800293, "learning_rate": 1.2808219178082193e-05, "loss": 0.5094, "step": 25100 }, { "epoch": 1.1484018264840183, "grad_norm": 77.74162292480469, "learning_rate": 1.2773972602739726e-05, "loss": 0.7281, "step": 25150 }, { "epoch": 1.1506849315068493, "grad_norm": 8.394207954406738, "learning_rate": 1.2739726027397259e-05, "loss": 0.615, "step": 25200 }, { "epoch": 1.1529680365296804, "grad_norm": 62.93704605102539, "learning_rate": 1.2705479452054794e-05, "loss": 0.5909, "step": 25250 }, { "epoch": 1.1552511415525113, "grad_norm": 18.461288452148438, "learning_rate": 1.2671232876712329e-05, "loss": 0.6578, "step": 25300 }, { "epoch": 1.1575342465753424, "grad_norm": 17.798723220825195, "learning_rate": 1.2636986301369863e-05, "loss": 0.7566, "step": 25350 }, { "epoch": 1.1598173515981736, "grad_norm": 0.32706037163734436, "learning_rate": 1.2602739726027396e-05, "loss": 0.6193, "step": 25400 }, { "epoch": 1.1621004566210045, "grad_norm": 19.030363082885742, "learning_rate": 1.2568493150684931e-05, "loss": 0.6249, "step": 25450 }, { "epoch": 1.1643835616438356, "grad_norm": 0.1700681447982788, "learning_rate": 1.2534246575342466e-05, "loss": 0.7782, "step": 25500 }, { "epoch": 1.1666666666666667, "grad_norm": 147.6623077392578, "learning_rate": 1.25e-05, "loss": 0.6307, "step": 25550 }, { "epoch": 1.1689497716894977, "grad_norm": 13.00705623626709, "learning_rate": 1.2465753424657534e-05, "loss": 0.6638, "step": 25600 }, { "epoch": 1.1712328767123288, "grad_norm": 10.626334190368652, "learning_rate": 1.2431506849315068e-05, "loss": 0.7118, "step": 25650 }, { "epoch": 1.17351598173516, "grad_norm": 5.709634304046631, "learning_rate": 1.2397260273972603e-05, "loss": 0.6299, "step": 25700 }, { "epoch": 1.1757990867579908, "grad_norm": 30.29009246826172, "learning_rate": 1.2363013698630138e-05, "loss": 0.7486, "step": 25750 }, { "epoch": 1.178082191780822, "grad_norm": 18.62900733947754, "learning_rate": 1.2328767123287671e-05, "loss": 0.5936, "step": 25800 }, { "epoch": 1.1803652968036529, "grad_norm": 14.033234596252441, "learning_rate": 1.2294520547945206e-05, "loss": 0.7004, "step": 25850 }, { "epoch": 1.182648401826484, "grad_norm": 38.063873291015625, "learning_rate": 1.226027397260274e-05, "loss": 0.7239, "step": 25900 }, { "epoch": 1.1849315068493151, "grad_norm": 40.94581985473633, "learning_rate": 1.2226027397260275e-05, "loss": 0.6232, "step": 25950 }, { "epoch": 1.187214611872146, "grad_norm": 10.695091247558594, "learning_rate": 1.2191780821917808e-05, "loss": 0.8526, "step": 26000 }, { "epoch": 1.1894977168949772, "grad_norm": 19.597412109375, "learning_rate": 1.2157534246575343e-05, "loss": 0.6878, "step": 26050 }, { "epoch": 1.191780821917808, "grad_norm": 57.820003509521484, "learning_rate": 1.2123287671232878e-05, "loss": 0.7689, "step": 26100 }, { "epoch": 1.1940639269406392, "grad_norm": 19.449691772460938, "learning_rate": 1.2089041095890412e-05, "loss": 0.6267, "step": 26150 }, { "epoch": 1.1963470319634704, "grad_norm": 30.782194137573242, "learning_rate": 1.2054794520547945e-05, "loss": 0.6165, "step": 26200 }, { "epoch": 1.1986301369863013, "grad_norm": 27.298837661743164, "learning_rate": 1.202054794520548e-05, "loss": 0.8057, "step": 26250 }, { "epoch": 1.2009132420091324, "grad_norm": 185.9980010986328, "learning_rate": 1.1986301369863015e-05, "loss": 0.7694, "step": 26300 }, { "epoch": 1.2031963470319635, "grad_norm": 3.1694886684417725, "learning_rate": 1.195205479452055e-05, "loss": 0.6598, "step": 26350 }, { "epoch": 1.2054794520547945, "grad_norm": 6.1167097091674805, "learning_rate": 1.1917808219178083e-05, "loss": 0.6464, "step": 26400 }, { "epoch": 1.2077625570776256, "grad_norm": 15.646038055419922, "learning_rate": 1.1883561643835616e-05, "loss": 0.9328, "step": 26450 }, { "epoch": 1.2100456621004567, "grad_norm": 7.978808403015137, "learning_rate": 1.184931506849315e-05, "loss": 0.6384, "step": 26500 }, { "epoch": 1.2123287671232876, "grad_norm": 21.067642211914062, "learning_rate": 1.1815068493150685e-05, "loss": 0.8629, "step": 26550 }, { "epoch": 1.2146118721461188, "grad_norm": 27.418428421020508, "learning_rate": 1.1780821917808218e-05, "loss": 0.6208, "step": 26600 }, { "epoch": 1.2168949771689497, "grad_norm": 10.672882080078125, "learning_rate": 1.1746575342465753e-05, "loss": 0.5777, "step": 26650 }, { "epoch": 1.2191780821917808, "grad_norm": 82.25353240966797, "learning_rate": 1.1712328767123288e-05, "loss": 0.9331, "step": 26700 }, { "epoch": 1.221461187214612, "grad_norm": 12.071101188659668, "learning_rate": 1.1678082191780822e-05, "loss": 0.7766, "step": 26750 }, { "epoch": 1.2237442922374429, "grad_norm": 23.268993377685547, "learning_rate": 1.1643835616438355e-05, "loss": 0.6875, "step": 26800 }, { "epoch": 1.226027397260274, "grad_norm": 39.18050003051758, "learning_rate": 1.160958904109589e-05, "loss": 0.7285, "step": 26850 }, { "epoch": 1.228310502283105, "grad_norm": 3.805318832397461, "learning_rate": 1.1575342465753425e-05, "loss": 0.6275, "step": 26900 }, { "epoch": 1.230593607305936, "grad_norm": 39.39872741699219, "learning_rate": 1.154109589041096e-05, "loss": 0.6889, "step": 26950 }, { "epoch": 1.2328767123287672, "grad_norm": 16.79233169555664, "learning_rate": 1.1506849315068493e-05, "loss": 0.5247, "step": 27000 }, { "epoch": 1.235159817351598, "grad_norm": 30.6214656829834, "learning_rate": 1.1472602739726027e-05, "loss": 0.5567, "step": 27050 }, { "epoch": 1.2374429223744292, "grad_norm": 5.248475551605225, "learning_rate": 1.1438356164383562e-05, "loss": 0.7133, "step": 27100 }, { "epoch": 1.2397260273972603, "grad_norm": 16.61432456970215, "learning_rate": 1.1404109589041097e-05, "loss": 0.6672, "step": 27150 }, { "epoch": 1.2420091324200913, "grad_norm": 16.011980056762695, "learning_rate": 1.136986301369863e-05, "loss": 0.7677, "step": 27200 }, { "epoch": 1.2442922374429224, "grad_norm": 34.770938873291016, "learning_rate": 1.1335616438356165e-05, "loss": 0.6866, "step": 27250 }, { "epoch": 1.2465753424657535, "grad_norm": 39.278892517089844, "learning_rate": 1.13013698630137e-05, "loss": 0.7042, "step": 27300 }, { "epoch": 1.2488584474885844, "grad_norm": 21.77368927001953, "learning_rate": 1.1267123287671234e-05, "loss": 0.6923, "step": 27350 }, { "epoch": 1.2511415525114156, "grad_norm": 8.305987358093262, "learning_rate": 1.1232876712328767e-05, "loss": 0.6222, "step": 27400 }, { "epoch": 1.2534246575342465, "grad_norm": 27.238414764404297, "learning_rate": 1.1198630136986302e-05, "loss": 0.6661, "step": 27450 }, { "epoch": 1.2557077625570776, "grad_norm": 25.92155647277832, "learning_rate": 1.1164383561643837e-05, "loss": 0.7408, "step": 27500 }, { "epoch": 1.2579908675799087, "grad_norm": 16.504167556762695, "learning_rate": 1.1130136986301371e-05, "loss": 0.5134, "step": 27550 }, { "epoch": 1.2602739726027397, "grad_norm": 33.571617126464844, "learning_rate": 1.1095890410958904e-05, "loss": 0.7309, "step": 27600 }, { "epoch": 1.2625570776255708, "grad_norm": 10.731171607971191, "learning_rate": 1.106164383561644e-05, "loss": 0.7008, "step": 27650 }, { "epoch": 1.2648401826484017, "grad_norm": 9.138223648071289, "learning_rate": 1.1027397260273972e-05, "loss": 0.7206, "step": 27700 }, { "epoch": 1.2671232876712328, "grad_norm": 19.61602210998535, "learning_rate": 1.0993150684931507e-05, "loss": 0.7169, "step": 27750 }, { "epoch": 1.269406392694064, "grad_norm": 25.721527099609375, "learning_rate": 1.095890410958904e-05, "loss": 0.6631, "step": 27800 }, { "epoch": 1.271689497716895, "grad_norm": 15.044477462768555, "learning_rate": 1.0924657534246575e-05, "loss": 0.6167, "step": 27850 }, { "epoch": 1.273972602739726, "grad_norm": 13.365096092224121, "learning_rate": 1.089041095890411e-05, "loss": 0.5629, "step": 27900 }, { "epoch": 1.2762557077625571, "grad_norm": 26.571229934692383, "learning_rate": 1.0856164383561644e-05, "loss": 0.6986, "step": 27950 }, { "epoch": 1.278538812785388, "grad_norm": 23.07392692565918, "learning_rate": 1.0821917808219177e-05, "loss": 0.6209, "step": 28000 }, { "epoch": 1.2808219178082192, "grad_norm": 1.8312214612960815, "learning_rate": 1.0787671232876712e-05, "loss": 0.7247, "step": 28050 }, { "epoch": 1.2831050228310503, "grad_norm": 3.6338565349578857, "learning_rate": 1.0753424657534247e-05, "loss": 0.896, "step": 28100 }, { "epoch": 1.2853881278538812, "grad_norm": 22.483678817749023, "learning_rate": 1.0719178082191782e-05, "loss": 0.7087, "step": 28150 }, { "epoch": 1.2876712328767124, "grad_norm": 29.26844024658203, "learning_rate": 1.0684931506849315e-05, "loss": 0.7095, "step": 28200 }, { "epoch": 1.2899543378995433, "grad_norm": 0.29415565729141235, "learning_rate": 1.065068493150685e-05, "loss": 0.5849, "step": 28250 }, { "epoch": 1.2922374429223744, "grad_norm": 0.7176687121391296, "learning_rate": 1.0616438356164384e-05, "loss": 0.657, "step": 28300 }, { "epoch": 1.2945205479452055, "grad_norm": 11.345915794372559, "learning_rate": 1.0582191780821919e-05, "loss": 0.5598, "step": 28350 }, { "epoch": 1.2968036529680365, "grad_norm": 35.81932067871094, "learning_rate": 1.0547945205479452e-05, "loss": 0.8267, "step": 28400 }, { "epoch": 1.2990867579908676, "grad_norm": 30.678194046020508, "learning_rate": 1.0513698630136987e-05, "loss": 0.6508, "step": 28450 }, { "epoch": 1.3013698630136985, "grad_norm": 3.55430006980896, "learning_rate": 1.0479452054794521e-05, "loss": 0.6641, "step": 28500 }, { "epoch": 1.3036529680365296, "grad_norm": 18.566883087158203, "learning_rate": 1.0445205479452056e-05, "loss": 0.7889, "step": 28550 }, { "epoch": 1.3059360730593608, "grad_norm": 19.019390106201172, "learning_rate": 1.0410958904109589e-05, "loss": 0.6177, "step": 28600 }, { "epoch": 1.308219178082192, "grad_norm": 17.26028823852539, "learning_rate": 1.0376712328767124e-05, "loss": 0.7045, "step": 28650 }, { "epoch": 1.3105022831050228, "grad_norm": 33.43575668334961, "learning_rate": 1.0342465753424659e-05, "loss": 0.6046, "step": 28700 }, { "epoch": 1.312785388127854, "grad_norm": 51.09929275512695, "learning_rate": 1.0308219178082193e-05, "loss": 0.5056, "step": 28750 }, { "epoch": 1.3150684931506849, "grad_norm": 17.530790328979492, "learning_rate": 1.0273972602739726e-05, "loss": 0.6507, "step": 28800 }, { "epoch": 1.317351598173516, "grad_norm": 26.142900466918945, "learning_rate": 1.0239726027397261e-05, "loss": 0.5859, "step": 28850 }, { "epoch": 1.3196347031963471, "grad_norm": 16.933874130249023, "learning_rate": 1.0205479452054796e-05, "loss": 0.7274, "step": 28900 }, { "epoch": 1.321917808219178, "grad_norm": 5.472013473510742, "learning_rate": 1.017123287671233e-05, "loss": 0.6445, "step": 28950 }, { "epoch": 1.3242009132420092, "grad_norm": 20.826509475708008, "learning_rate": 1.0136986301369862e-05, "loss": 0.7194, "step": 29000 }, { "epoch": 1.32648401826484, "grad_norm": 36.42399597167969, "learning_rate": 1.0102739726027397e-05, "loss": 0.6405, "step": 29050 }, { "epoch": 1.3287671232876712, "grad_norm": 17.444469451904297, "learning_rate": 1.0068493150684931e-05, "loss": 0.7348, "step": 29100 }, { "epoch": 1.3310502283105023, "grad_norm": 12.610512733459473, "learning_rate": 1.0034246575342466e-05, "loss": 0.6521, "step": 29150 }, { "epoch": 1.3333333333333333, "grad_norm": 14.737506866455078, "learning_rate": 9.999999999999999e-06, "loss": 0.6979, "step": 29200 }, { "epoch": 1.3356164383561644, "grad_norm": 27.808490753173828, "learning_rate": 9.965753424657534e-06, "loss": 0.7839, "step": 29250 }, { "epoch": 1.3378995433789953, "grad_norm": 7.368427753448486, "learning_rate": 9.931506849315069e-06, "loss": 0.6117, "step": 29300 }, { "epoch": 1.3401826484018264, "grad_norm": 27.834564208984375, "learning_rate": 9.897260273972603e-06, "loss": 0.6565, "step": 29350 }, { "epoch": 1.3424657534246576, "grad_norm": 33.823081970214844, "learning_rate": 9.863013698630136e-06, "loss": 0.6681, "step": 29400 }, { "epoch": 1.3447488584474887, "grad_norm": 26.18691635131836, "learning_rate": 9.828767123287671e-06, "loss": 0.9009, "step": 29450 }, { "epoch": 1.3470319634703196, "grad_norm": 20.92985725402832, "learning_rate": 9.794520547945206e-06, "loss": 0.5297, "step": 29500 }, { "epoch": 1.3493150684931507, "grad_norm": 4.244528770446777, "learning_rate": 9.76027397260274e-06, "loss": 0.6733, "step": 29550 }, { "epoch": 1.3515981735159817, "grad_norm": 67.45909118652344, "learning_rate": 9.726027397260274e-06, "loss": 0.575, "step": 29600 }, { "epoch": 1.3538812785388128, "grad_norm": 26.713943481445312, "learning_rate": 9.691780821917808e-06, "loss": 0.7428, "step": 29650 }, { "epoch": 1.356164383561644, "grad_norm": 8.588297843933105, "learning_rate": 9.657534246575343e-06, "loss": 0.5567, "step": 29700 }, { "epoch": 1.3584474885844748, "grad_norm": 22.027597427368164, "learning_rate": 9.623287671232878e-06, "loss": 0.7798, "step": 29750 }, { "epoch": 1.360730593607306, "grad_norm": 10.512967109680176, "learning_rate": 9.589041095890411e-06, "loss": 0.5646, "step": 29800 }, { "epoch": 1.3630136986301369, "grad_norm": 9.836832046508789, "learning_rate": 9.554794520547946e-06, "loss": 0.6904, "step": 29850 }, { "epoch": 1.365296803652968, "grad_norm": 74.73491668701172, "learning_rate": 9.52054794520548e-06, "loss": 0.7029, "step": 29900 }, { "epoch": 1.3675799086757991, "grad_norm": 36.467491149902344, "learning_rate": 9.486301369863015e-06, "loss": 0.7266, "step": 29950 }, { "epoch": 1.36986301369863, "grad_norm": 21.993133544921875, "learning_rate": 9.452054794520548e-06, "loss": 0.6316, "step": 30000 }, { "epoch": 1.3721461187214612, "grad_norm": 1.2455904483795166, "learning_rate": 9.417808219178083e-06, "loss": 0.5555, "step": 30050 }, { "epoch": 1.374429223744292, "grad_norm": 13.85091495513916, "learning_rate": 9.383561643835618e-06, "loss": 0.7729, "step": 30100 }, { "epoch": 1.3767123287671232, "grad_norm": 23.61090850830078, "learning_rate": 9.349315068493152e-06, "loss": 0.6255, "step": 30150 }, { "epoch": 1.3789954337899544, "grad_norm": 40.24787139892578, "learning_rate": 9.315068493150685e-06, "loss": 0.7646, "step": 30200 }, { "epoch": 1.3812785388127855, "grad_norm": 23.249425888061523, "learning_rate": 9.28082191780822e-06, "loss": 0.5965, "step": 30250 }, { "epoch": 1.3835616438356164, "grad_norm": 8.090579986572266, "learning_rate": 9.246575342465753e-06, "loss": 0.4478, "step": 30300 }, { "epoch": 1.3858447488584476, "grad_norm": 20.011905670166016, "learning_rate": 9.212328767123288e-06, "loss": 0.7777, "step": 30350 }, { "epoch": 1.3881278538812785, "grad_norm": 8.697684288024902, "learning_rate": 9.178082191780821e-06, "loss": 0.6304, "step": 30400 }, { "epoch": 1.3904109589041096, "grad_norm": 1.545689582824707, "learning_rate": 9.143835616438356e-06, "loss": 0.7816, "step": 30450 }, { "epoch": 1.3926940639269407, "grad_norm": 0.6150842308998108, "learning_rate": 9.10958904109589e-06, "loss": 0.5802, "step": 30500 }, { "epoch": 1.3949771689497716, "grad_norm": 21.5743350982666, "learning_rate": 9.075342465753425e-06, "loss": 0.6369, "step": 30550 }, { "epoch": 1.3972602739726028, "grad_norm": 10.7164306640625, "learning_rate": 9.041095890410958e-06, "loss": 0.4902, "step": 30600 }, { "epoch": 1.3995433789954337, "grad_norm": 28.598312377929688, "learning_rate": 9.006849315068493e-06, "loss": 0.613, "step": 30650 }, { "epoch": 1.4018264840182648, "grad_norm": 24.138431549072266, "learning_rate": 8.972602739726028e-06, "loss": 0.6153, "step": 30700 }, { "epoch": 1.404109589041096, "grad_norm": 22.55198860168457, "learning_rate": 8.938356164383562e-06, "loss": 0.5419, "step": 30750 }, { "epoch": 1.4063926940639269, "grad_norm": 26.992374420166016, "learning_rate": 8.904109589041095e-06, "loss": 0.5726, "step": 30800 }, { "epoch": 1.408675799086758, "grad_norm": 1.8640153408050537, "learning_rate": 8.86986301369863e-06, "loss": 0.6794, "step": 30850 }, { "epoch": 1.410958904109589, "grad_norm": 14.333259582519531, "learning_rate": 8.835616438356165e-06, "loss": 0.4972, "step": 30900 }, { "epoch": 1.41324200913242, "grad_norm": 64.34575653076172, "learning_rate": 8.8013698630137e-06, "loss": 0.9213, "step": 30950 }, { "epoch": 1.4155251141552512, "grad_norm": 9.60162353515625, "learning_rate": 8.767123287671233e-06, "loss": 0.6868, "step": 31000 }, { "epoch": 1.4178082191780823, "grad_norm": 8.155502319335938, "learning_rate": 8.732876712328767e-06, "loss": 0.6873, "step": 31050 }, { "epoch": 1.4200913242009132, "grad_norm": 5.085892677307129, "learning_rate": 8.698630136986302e-06, "loss": 0.7768, "step": 31100 }, { "epoch": 1.4223744292237444, "grad_norm": 52.48747634887695, "learning_rate": 8.664383561643837e-06, "loss": 0.687, "step": 31150 }, { "epoch": 1.4246575342465753, "grad_norm": 1.8491209745407104, "learning_rate": 8.63013698630137e-06, "loss": 0.5669, "step": 31200 }, { "epoch": 1.4269406392694064, "grad_norm": 12.143204689025879, "learning_rate": 8.595890410958905e-06, "loss": 0.5568, "step": 31250 }, { "epoch": 1.4292237442922375, "grad_norm": 2.939903974533081, "learning_rate": 8.56164383561644e-06, "loss": 0.7732, "step": 31300 }, { "epoch": 1.4315068493150684, "grad_norm": 17.773130416870117, "learning_rate": 8.527397260273974e-06, "loss": 0.7444, "step": 31350 }, { "epoch": 1.4337899543378996, "grad_norm": 47.27958679199219, "learning_rate": 8.493150684931507e-06, "loss": 0.6621, "step": 31400 }, { "epoch": 1.4360730593607305, "grad_norm": 50.40327453613281, "learning_rate": 8.458904109589042e-06, "loss": 0.84, "step": 31450 }, { "epoch": 1.4383561643835616, "grad_norm": 8.335402488708496, "learning_rate": 8.424657534246577e-06, "loss": 0.6762, "step": 31500 }, { "epoch": 1.4406392694063928, "grad_norm": 12.027316093444824, "learning_rate": 8.390410958904111e-06, "loss": 0.6736, "step": 31550 }, { "epoch": 1.4429223744292237, "grad_norm": 17.410192489624023, "learning_rate": 8.356164383561643e-06, "loss": 0.5072, "step": 31600 }, { "epoch": 1.4452054794520548, "grad_norm": 48.263450622558594, "learning_rate": 8.321917808219178e-06, "loss": 0.6268, "step": 31650 }, { "epoch": 1.4474885844748857, "grad_norm": 3.8568694591522217, "learning_rate": 8.287671232876712e-06, "loss": 0.5454, "step": 31700 }, { "epoch": 1.4497716894977168, "grad_norm": 13.764704704284668, "learning_rate": 8.253424657534247e-06, "loss": 0.6823, "step": 31750 }, { "epoch": 1.452054794520548, "grad_norm": 13.48620319366455, "learning_rate": 8.21917808219178e-06, "loss": 0.7103, "step": 31800 }, { "epoch": 1.454337899543379, "grad_norm": 17.291501998901367, "learning_rate": 8.184931506849315e-06, "loss": 0.7011, "step": 31850 }, { "epoch": 1.45662100456621, "grad_norm": 1.461418867111206, "learning_rate": 8.15068493150685e-06, "loss": 0.6667, "step": 31900 }, { "epoch": 1.4589041095890412, "grad_norm": 16.34942626953125, "learning_rate": 8.116438356164384e-06, "loss": 0.7885, "step": 31950 }, { "epoch": 1.461187214611872, "grad_norm": 14.74634075164795, "learning_rate": 8.082191780821917e-06, "loss": 0.644, "step": 32000 }, { "epoch": 1.4634703196347032, "grad_norm": 6.794888973236084, "learning_rate": 8.047945205479452e-06, "loss": 0.6738, "step": 32050 }, { "epoch": 1.4657534246575343, "grad_norm": 31.303226470947266, "learning_rate": 8.013698630136987e-06, "loss": 0.8235, "step": 32100 }, { "epoch": 1.4680365296803652, "grad_norm": 42.993648529052734, "learning_rate": 7.979452054794521e-06, "loss": 0.4712, "step": 32150 }, { "epoch": 1.4703196347031964, "grad_norm": 7.875132083892822, "learning_rate": 7.945205479452055e-06, "loss": 0.5951, "step": 32200 }, { "epoch": 1.4726027397260273, "grad_norm": 9.124963760375977, "learning_rate": 7.91095890410959e-06, "loss": 0.5968, "step": 32250 }, { "epoch": 1.4748858447488584, "grad_norm": 13.793811798095703, "learning_rate": 7.876712328767124e-06, "loss": 0.6484, "step": 32300 }, { "epoch": 1.4771689497716896, "grad_norm": 2.1718921661376953, "learning_rate": 7.842465753424659e-06, "loss": 0.6279, "step": 32350 }, { "epoch": 1.4794520547945205, "grad_norm": 60.621543884277344, "learning_rate": 7.808219178082192e-06, "loss": 0.6718, "step": 32400 }, { "epoch": 1.4817351598173516, "grad_norm": 6.748918533325195, "learning_rate": 7.773972602739727e-06, "loss": 0.6333, "step": 32450 }, { "epoch": 1.4840182648401825, "grad_norm": 10.061300277709961, "learning_rate": 7.739726027397261e-06, "loss": 0.5952, "step": 32500 }, { "epoch": 1.4863013698630136, "grad_norm": 55.56220245361328, "learning_rate": 7.705479452054796e-06, "loss": 0.6297, "step": 32550 }, { "epoch": 1.4885844748858448, "grad_norm": 31.07186508178711, "learning_rate": 7.671232876712329e-06, "loss": 0.5302, "step": 32600 }, { "epoch": 1.490867579908676, "grad_norm": 30.925626754760742, "learning_rate": 7.636986301369864e-06, "loss": 0.6608, "step": 32650 }, { "epoch": 1.4931506849315068, "grad_norm": 21.15188217163086, "learning_rate": 7.602739726027398e-06, "loss": 0.6838, "step": 32700 }, { "epoch": 1.495433789954338, "grad_norm": 15.808161735534668, "learning_rate": 7.568493150684932e-06, "loss": 0.5405, "step": 32750 }, { "epoch": 1.4977168949771689, "grad_norm": 11.866249084472656, "learning_rate": 7.5342465753424655e-06, "loss": 0.6924, "step": 32800 }, { "epoch": 1.5, "grad_norm": 29.02684783935547, "learning_rate": 7.5e-06, "loss": 0.6354, "step": 32850 }, { "epoch": 1.5022831050228311, "grad_norm": 20.26506996154785, "learning_rate": 7.465753424657534e-06, "loss": 0.6467, "step": 32900 }, { "epoch": 1.5045662100456623, "grad_norm": 23.63490867614746, "learning_rate": 7.431506849315069e-06, "loss": 0.6682, "step": 32950 }, { "epoch": 1.5068493150684932, "grad_norm": 16.075380325317383, "learning_rate": 7.397260273972603e-06, "loss": 0.6922, "step": 33000 }, { "epoch": 1.509132420091324, "grad_norm": 14.159255027770996, "learning_rate": 7.3630136986301374e-06, "loss": 0.6063, "step": 33050 }, { "epoch": 1.5114155251141552, "grad_norm": 22.143796920776367, "learning_rate": 7.328767123287671e-06, "loss": 0.7155, "step": 33100 }, { "epoch": 1.5136986301369864, "grad_norm": 98.22097778320312, "learning_rate": 7.294520547945206e-06, "loss": 0.7567, "step": 33150 }, { "epoch": 1.5159817351598175, "grad_norm": 0.7336256504058838, "learning_rate": 7.260273972602739e-06, "loss": 0.5414, "step": 33200 }, { "epoch": 1.5182648401826484, "grad_norm": 0.3773713707923889, "learning_rate": 7.226027397260274e-06, "loss": 0.5702, "step": 33250 }, { "epoch": 1.5205479452054793, "grad_norm": 8.909625053405762, "learning_rate": 7.191780821917808e-06, "loss": 0.7228, "step": 33300 }, { "epoch": 1.5228310502283104, "grad_norm": 21.098960876464844, "learning_rate": 7.1575342465753425e-06, "loss": 0.6001, "step": 33350 }, { "epoch": 1.5251141552511416, "grad_norm": 15.906450271606445, "learning_rate": 7.123287671232876e-06, "loss": 0.6303, "step": 33400 }, { "epoch": 1.5273972602739727, "grad_norm": 24.9348201751709, "learning_rate": 7.089041095890411e-06, "loss": 0.673, "step": 33450 }, { "epoch": 1.5296803652968036, "grad_norm": 8.255683898925781, "learning_rate": 7.054794520547945e-06, "loss": 0.5831, "step": 33500 }, { "epoch": 1.5319634703196348, "grad_norm": 73.46847534179688, "learning_rate": 7.02054794520548e-06, "loss": 0.7362, "step": 33550 }, { "epoch": 1.5342465753424657, "grad_norm": 88.85016632080078, "learning_rate": 6.986301369863014e-06, "loss": 0.6682, "step": 33600 }, { "epoch": 1.5365296803652968, "grad_norm": 52.53008270263672, "learning_rate": 6.952054794520548e-06, "loss": 0.5667, "step": 33650 }, { "epoch": 1.538812785388128, "grad_norm": 18.00398826599121, "learning_rate": 6.917808219178082e-06, "loss": 0.6214, "step": 33700 }, { "epoch": 1.541095890410959, "grad_norm": 27.124656677246094, "learning_rate": 6.883561643835617e-06, "loss": 0.6737, "step": 33750 }, { "epoch": 1.54337899543379, "grad_norm": 39.45083999633789, "learning_rate": 6.849315068493151e-06, "loss": 0.703, "step": 33800 }, { "epoch": 1.545662100456621, "grad_norm": 0.20495979487895966, "learning_rate": 6.815068493150685e-06, "loss": 0.6271, "step": 33850 }, { "epoch": 1.547945205479452, "grad_norm": 0.8208453059196472, "learning_rate": 6.780821917808219e-06, "loss": 0.6804, "step": 33900 }, { "epoch": 1.5502283105022832, "grad_norm": 12.416110038757324, "learning_rate": 6.746575342465753e-06, "loss": 0.7786, "step": 33950 }, { "epoch": 1.5525114155251143, "grad_norm": 21.69839096069336, "learning_rate": 6.712328767123287e-06, "loss": 0.7048, "step": 34000 }, { "epoch": 1.5547945205479452, "grad_norm": 57.30881881713867, "learning_rate": 6.678082191780822e-06, "loss": 0.7398, "step": 34050 }, { "epoch": 1.5570776255707761, "grad_norm": 22.58492088317871, "learning_rate": 6.643835616438356e-06, "loss": 0.7027, "step": 34100 }, { "epoch": 1.5593607305936072, "grad_norm": 8.803092002868652, "learning_rate": 6.609589041095891e-06, "loss": 0.7296, "step": 34150 }, { "epoch": 1.5616438356164384, "grad_norm": 18.931156158447266, "learning_rate": 6.5753424657534245e-06, "loss": 0.7622, "step": 34200 }, { "epoch": 1.5639269406392695, "grad_norm": 8.858073234558105, "learning_rate": 6.541095890410959e-06, "loss": 0.615, "step": 34250 }, { "epoch": 1.5662100456621004, "grad_norm": 6.284381866455078, "learning_rate": 6.506849315068493e-06, "loss": 0.7147, "step": 34300 }, { "epoch": 1.5684931506849316, "grad_norm": 21.08570098876953, "learning_rate": 6.472602739726028e-06, "loss": 0.6153, "step": 34350 }, { "epoch": 1.5707762557077625, "grad_norm": 2.0850419998168945, "learning_rate": 6.438356164383562e-06, "loss": 0.588, "step": 34400 }, { "epoch": 1.5730593607305936, "grad_norm": 0.25530076026916504, "learning_rate": 6.4041095890410965e-06, "loss": 0.7249, "step": 34450 }, { "epoch": 1.5753424657534247, "grad_norm": 31.284807205200195, "learning_rate": 6.3698630136986296e-06, "loss": 0.6106, "step": 34500 }, { "epoch": 1.5776255707762559, "grad_norm": 19.524412155151367, "learning_rate": 6.335616438356164e-06, "loss": 0.7259, "step": 34550 }, { "epoch": 1.5799086757990868, "grad_norm": 6.005446910858154, "learning_rate": 6.301369863013698e-06, "loss": 0.5231, "step": 34600 }, { "epoch": 1.5821917808219177, "grad_norm": 17.577402114868164, "learning_rate": 6.267123287671233e-06, "loss": 0.847, "step": 34650 }, { "epoch": 1.5844748858447488, "grad_norm": 31.817855834960938, "learning_rate": 6.232876712328767e-06, "loss": 0.5509, "step": 34700 }, { "epoch": 1.58675799086758, "grad_norm": 4.6908063888549805, "learning_rate": 6.1986301369863016e-06, "loss": 0.752, "step": 34750 }, { "epoch": 1.589041095890411, "grad_norm": 2.8228561878204346, "learning_rate": 6.1643835616438354e-06, "loss": 0.7156, "step": 34800 }, { "epoch": 1.591324200913242, "grad_norm": 7.878891468048096, "learning_rate": 6.13013698630137e-06, "loss": 0.6926, "step": 34850 }, { "epoch": 1.593607305936073, "grad_norm": 30.530006408691406, "learning_rate": 6.095890410958904e-06, "loss": 0.7552, "step": 34900 }, { "epoch": 1.595890410958904, "grad_norm": 29.396806716918945, "learning_rate": 6.061643835616439e-06, "loss": 0.7008, "step": 34950 }, { "epoch": 1.5981735159817352, "grad_norm": 10.500929832458496, "learning_rate": 6.027397260273973e-06, "loss": 0.4952, "step": 35000 }, { "epoch": 1.6004566210045663, "grad_norm": 2.337519407272339, "learning_rate": 5.9931506849315074e-06, "loss": 0.5864, "step": 35050 }, { "epoch": 1.6027397260273972, "grad_norm": 8.646376609802246, "learning_rate": 5.958904109589041e-06, "loss": 0.7454, "step": 35100 }, { "epoch": 1.6050228310502284, "grad_norm": 17.153099060058594, "learning_rate": 5.924657534246575e-06, "loss": 0.6943, "step": 35150 }, { "epoch": 1.6073059360730593, "grad_norm": 25.350088119506836, "learning_rate": 5.890410958904109e-06, "loss": 0.6803, "step": 35200 }, { "epoch": 1.6095890410958904, "grad_norm": 4.12929105758667, "learning_rate": 5.856164383561644e-06, "loss": 0.6976, "step": 35250 }, { "epoch": 1.6118721461187215, "grad_norm": 14.61955451965332, "learning_rate": 5.821917808219178e-06, "loss": 0.7487, "step": 35300 }, { "epoch": 1.6141552511415527, "grad_norm": 6.208589553833008, "learning_rate": 5.7876712328767125e-06, "loss": 0.6569, "step": 35350 }, { "epoch": 1.6164383561643836, "grad_norm": 9.5521240234375, "learning_rate": 5.753424657534246e-06, "loss": 0.6505, "step": 35400 }, { "epoch": 1.6187214611872145, "grad_norm": 14.391396522521973, "learning_rate": 5.719178082191781e-06, "loss": 0.5558, "step": 35450 }, { "epoch": 1.6210045662100456, "grad_norm": 1.2627131938934326, "learning_rate": 5.684931506849315e-06, "loss": 0.6405, "step": 35500 }, { "epoch": 1.6232876712328768, "grad_norm": 83.7956314086914, "learning_rate": 5.65068493150685e-06, "loss": 0.5483, "step": 35550 }, { "epoch": 1.625570776255708, "grad_norm": 15.18497085571289, "learning_rate": 5.616438356164384e-06, "loss": 0.6692, "step": 35600 }, { "epoch": 1.6278538812785388, "grad_norm": 33.16044998168945, "learning_rate": 5.582191780821918e-06, "loss": 0.6939, "step": 35650 }, { "epoch": 1.6301369863013697, "grad_norm": 12.063103675842285, "learning_rate": 5.547945205479452e-06, "loss": 0.5414, "step": 35700 }, { "epoch": 1.6324200913242009, "grad_norm": 26.803749084472656, "learning_rate": 5.513698630136986e-06, "loss": 0.5353, "step": 35750 }, { "epoch": 1.634703196347032, "grad_norm": 6.5856523513793945, "learning_rate": 5.47945205479452e-06, "loss": 0.7678, "step": 35800 }, { "epoch": 1.6369863013698631, "grad_norm": 14.661989212036133, "learning_rate": 5.445205479452055e-06, "loss": 0.67, "step": 35850 }, { "epoch": 1.639269406392694, "grad_norm": 2.4577255249023438, "learning_rate": 5.410958904109589e-06, "loss": 0.4579, "step": 35900 }, { "epoch": 1.6415525114155252, "grad_norm": 17.849546432495117, "learning_rate": 5.376712328767123e-06, "loss": 0.7221, "step": 35950 }, { "epoch": 1.643835616438356, "grad_norm": 6.453017234802246, "learning_rate": 5.342465753424657e-06, "loss": 0.6889, "step": 36000 }, { "epoch": 1.6461187214611872, "grad_norm": 15.138044357299805, "learning_rate": 5.308219178082192e-06, "loss": 0.6234, "step": 36050 }, { "epoch": 1.6484018264840183, "grad_norm": 11.393730163574219, "learning_rate": 5.273972602739726e-06, "loss": 0.6353, "step": 36100 }, { "epoch": 1.6506849315068495, "grad_norm": 2.726991891860962, "learning_rate": 5.239726027397261e-06, "loss": 0.5305, "step": 36150 }, { "epoch": 1.6529680365296804, "grad_norm": 14.54566478729248, "learning_rate": 5.2054794520547945e-06, "loss": 0.8252, "step": 36200 }, { "epoch": 1.6552511415525113, "grad_norm": 0.952422022819519, "learning_rate": 5.171232876712329e-06, "loss": 0.6773, "step": 36250 }, { "epoch": 1.6575342465753424, "grad_norm": 21.44168472290039, "learning_rate": 5.136986301369863e-06, "loss": 0.6106, "step": 36300 }, { "epoch": 1.6598173515981736, "grad_norm": 11.663095474243164, "learning_rate": 5.102739726027398e-06, "loss": 0.6514, "step": 36350 }, { "epoch": 1.6621004566210047, "grad_norm": 23.589557647705078, "learning_rate": 5.068493150684931e-06, "loss": 0.708, "step": 36400 }, { "epoch": 1.6643835616438356, "grad_norm": 10.283199310302734, "learning_rate": 5.034246575342466e-06, "loss": 0.5563, "step": 36450 }, { "epoch": 1.6666666666666665, "grad_norm": 44.115047454833984, "learning_rate": 4.9999999999999996e-06, "loss": 0.5912, "step": 36500 }, { "epoch": 1.6689497716894977, "grad_norm": 71.8247299194336, "learning_rate": 4.965753424657534e-06, "loss": 0.7993, "step": 36550 }, { "epoch": 1.6712328767123288, "grad_norm": 11.808229446411133, "learning_rate": 4.931506849315068e-06, "loss": 0.5851, "step": 36600 }, { "epoch": 1.67351598173516, "grad_norm": 74.73955535888672, "learning_rate": 4.897260273972603e-06, "loss": 0.7004, "step": 36650 }, { "epoch": 1.6757990867579908, "grad_norm": 36.06229019165039, "learning_rate": 4.863013698630137e-06, "loss": 0.7382, "step": 36700 }, { "epoch": 1.678082191780822, "grad_norm": 53.29566955566406, "learning_rate": 4.8287671232876716e-06, "loss": 0.5035, "step": 36750 }, { "epoch": 1.6803652968036529, "grad_norm": 21.9272403717041, "learning_rate": 4.7945205479452054e-06, "loss": 0.5309, "step": 36800 }, { "epoch": 1.682648401826484, "grad_norm": 39.56712341308594, "learning_rate": 4.76027397260274e-06, "loss": 0.4754, "step": 36850 }, { "epoch": 1.6849315068493151, "grad_norm": 2.814680576324463, "learning_rate": 4.726027397260274e-06, "loss": 0.7033, "step": 36900 }, { "epoch": 1.6872146118721463, "grad_norm": 95.83110809326172, "learning_rate": 4.691780821917809e-06, "loss": 0.6999, "step": 36950 }, { "epoch": 1.6894977168949772, "grad_norm": 27.638185501098633, "learning_rate": 4.657534246575343e-06, "loss": 0.6522, "step": 37000 }, { "epoch": 1.691780821917808, "grad_norm": 10.899153709411621, "learning_rate": 4.623287671232877e-06, "loss": 0.6386, "step": 37050 }, { "epoch": 1.6940639269406392, "grad_norm": 21.410276412963867, "learning_rate": 4.5890410958904105e-06, "loss": 0.695, "step": 37100 }, { "epoch": 1.6963470319634704, "grad_norm": 15.208582878112793, "learning_rate": 4.554794520547945e-06, "loss": 0.7636, "step": 37150 }, { "epoch": 1.6986301369863015, "grad_norm": 19.083850860595703, "learning_rate": 4.520547945205479e-06, "loss": 0.6331, "step": 37200 }, { "epoch": 1.7009132420091324, "grad_norm": 4.408557415008545, "learning_rate": 4.486301369863014e-06, "loss": 0.6886, "step": 37250 }, { "epoch": 1.7031963470319633, "grad_norm": 10.206310272216797, "learning_rate": 4.452054794520548e-06, "loss": 0.639, "step": 37300 }, { "epoch": 1.7054794520547945, "grad_norm": 18.985891342163086, "learning_rate": 4.4178082191780825e-06, "loss": 0.6884, "step": 37350 }, { "epoch": 1.7077625570776256, "grad_norm": 16.533288955688477, "learning_rate": 4.383561643835616e-06, "loss": 0.6446, "step": 37400 }, { "epoch": 1.7100456621004567, "grad_norm": 25.728469848632812, "learning_rate": 4.349315068493151e-06, "loss": 0.4863, "step": 37450 }, { "epoch": 1.7123287671232876, "grad_norm": 88.2020492553711, "learning_rate": 4.315068493150685e-06, "loss": 0.6831, "step": 37500 }, { "epoch": 1.7146118721461188, "grad_norm": 44.737815856933594, "learning_rate": 4.28082191780822e-06, "loss": 0.4687, "step": 37550 }, { "epoch": 1.7168949771689497, "grad_norm": 1.9043503999710083, "learning_rate": 4.246575342465754e-06, "loss": 0.6608, "step": 37600 }, { "epoch": 1.7191780821917808, "grad_norm": 11.180625915527344, "learning_rate": 4.212328767123288e-06, "loss": 0.7672, "step": 37650 }, { "epoch": 1.721461187214612, "grad_norm": 3.448392391204834, "learning_rate": 4.178082191780821e-06, "loss": 0.6176, "step": 37700 }, { "epoch": 1.723744292237443, "grad_norm": 10.672887802124023, "learning_rate": 4.143835616438356e-06, "loss": 0.5263, "step": 37750 }, { "epoch": 1.726027397260274, "grad_norm": 15.69261360168457, "learning_rate": 4.10958904109589e-06, "loss": 0.4637, "step": 37800 }, { "epoch": 1.728310502283105, "grad_norm": 19.786346435546875, "learning_rate": 4.075342465753425e-06, "loss": 0.4554, "step": 37850 }, { "epoch": 1.730593607305936, "grad_norm": 31.991483688354492, "learning_rate": 4.041095890410959e-06, "loss": 0.561, "step": 37900 }, { "epoch": 1.7328767123287672, "grad_norm": 21.96062469482422, "learning_rate": 4.006849315068493e-06, "loss": 0.5969, "step": 37950 }, { "epoch": 1.7351598173515983, "grad_norm": 10.800865173339844, "learning_rate": 3.972602739726027e-06, "loss": 0.6058, "step": 38000 }, { "epoch": 1.7374429223744292, "grad_norm": 10.979826927185059, "learning_rate": 3.938356164383562e-06, "loss": 0.6034, "step": 38050 }, { "epoch": 1.7397260273972601, "grad_norm": 41.3328742980957, "learning_rate": 3.904109589041096e-06, "loss": 0.5687, "step": 38100 }, { "epoch": 1.7420091324200913, "grad_norm": 38.379608154296875, "learning_rate": 3.869863013698631e-06, "loss": 0.6931, "step": 38150 }, { "epoch": 1.7442922374429224, "grad_norm": 3.292733907699585, "learning_rate": 3.8356164383561645e-06, "loss": 0.4162, "step": 38200 }, { "epoch": 1.7465753424657535, "grad_norm": 18.47883415222168, "learning_rate": 3.801369863013699e-06, "loss": 0.6574, "step": 38250 }, { "epoch": 1.7488584474885844, "grad_norm": 10.917158126831055, "learning_rate": 3.7671232876712327e-06, "loss": 0.6617, "step": 38300 }, { "epoch": 1.7511415525114156, "grad_norm": 13.783547401428223, "learning_rate": 3.732876712328767e-06, "loss": 0.7701, "step": 38350 }, { "epoch": 1.7534246575342465, "grad_norm": 21.937267303466797, "learning_rate": 3.6986301369863014e-06, "loss": 0.7627, "step": 38400 }, { "epoch": 1.7557077625570776, "grad_norm": 15.421838760375977, "learning_rate": 3.6643835616438357e-06, "loss": 0.6636, "step": 38450 }, { "epoch": 1.7579908675799087, "grad_norm": 14.788371086120605, "learning_rate": 3.6301369863013696e-06, "loss": 0.5593, "step": 38500 }, { "epoch": 1.7602739726027399, "grad_norm": 5.76630163192749, "learning_rate": 3.595890410958904e-06, "loss": 0.5112, "step": 38550 }, { "epoch": 1.7625570776255708, "grad_norm": 23.72429656982422, "learning_rate": 3.561643835616438e-06, "loss": 0.5729, "step": 38600 }, { "epoch": 1.7648401826484017, "grad_norm": 18.512802124023438, "learning_rate": 3.5273972602739725e-06, "loss": 0.6827, "step": 38650 }, { "epoch": 1.7671232876712328, "grad_norm": 4.7244720458984375, "learning_rate": 3.493150684931507e-06, "loss": 0.6428, "step": 38700 }, { "epoch": 1.769406392694064, "grad_norm": 6.208735466003418, "learning_rate": 3.458904109589041e-06, "loss": 0.5986, "step": 38750 }, { "epoch": 1.771689497716895, "grad_norm": 2.6915433406829834, "learning_rate": 3.4246575342465754e-06, "loss": 0.5905, "step": 38800 }, { "epoch": 1.773972602739726, "grad_norm": 4.2726969718933105, "learning_rate": 3.3904109589041093e-06, "loss": 0.564, "step": 38850 }, { "epoch": 1.776255707762557, "grad_norm": 26.4520263671875, "learning_rate": 3.3561643835616436e-06, "loss": 0.5738, "step": 38900 }, { "epoch": 1.778538812785388, "grad_norm": 43.63593673706055, "learning_rate": 3.321917808219178e-06, "loss": 0.621, "step": 38950 }, { "epoch": 1.7808219178082192, "grad_norm": 4.847127437591553, "learning_rate": 3.2876712328767123e-06, "loss": 0.8252, "step": 39000 }, { "epoch": 1.7831050228310503, "grad_norm": 7.686138153076172, "learning_rate": 3.2534246575342466e-06, "loss": 0.7211, "step": 39050 }, { "epoch": 1.7853881278538812, "grad_norm": 42.92139434814453, "learning_rate": 3.219178082191781e-06, "loss": 0.7652, "step": 39100 }, { "epoch": 1.7876712328767124, "grad_norm": 57.05276107788086, "learning_rate": 3.1849315068493148e-06, "loss": 0.5234, "step": 39150 }, { "epoch": 1.7899543378995433, "grad_norm": 18.587209701538086, "learning_rate": 3.150684931506849e-06, "loss": 0.457, "step": 39200 }, { "epoch": 1.7922374429223744, "grad_norm": 7.19858455657959, "learning_rate": 3.1164383561643834e-06, "loss": 0.8795, "step": 39250 }, { "epoch": 1.7945205479452055, "grad_norm": 10.892264366149902, "learning_rate": 3.0821917808219177e-06, "loss": 0.7042, "step": 39300 }, { "epoch": 1.7968036529680367, "grad_norm": 28.82424545288086, "learning_rate": 3.047945205479452e-06, "loss": 0.6396, "step": 39350 }, { "epoch": 1.7990867579908676, "grad_norm": 7.087406158447266, "learning_rate": 3.0136986301369864e-06, "loss": 0.6665, "step": 39400 }, { "epoch": 1.8013698630136985, "grad_norm": 22.56847381591797, "learning_rate": 2.9794520547945207e-06, "loss": 0.7265, "step": 39450 }, { "epoch": 1.8036529680365296, "grad_norm": 18.845949172973633, "learning_rate": 2.9452054794520546e-06, "loss": 0.6475, "step": 39500 }, { "epoch": 1.8059360730593608, "grad_norm": 26.794076919555664, "learning_rate": 2.910958904109589e-06, "loss": 0.7632, "step": 39550 }, { "epoch": 1.808219178082192, "grad_norm": 0.44524723291397095, "learning_rate": 2.876712328767123e-06, "loss": 0.4843, "step": 39600 }, { "epoch": 1.8105022831050228, "grad_norm": 45.64598083496094, "learning_rate": 2.8424657534246575e-06, "loss": 0.5918, "step": 39650 }, { "epoch": 1.8127853881278537, "grad_norm": 75.41986846923828, "learning_rate": 2.808219178082192e-06, "loss": 0.6352, "step": 39700 }, { "epoch": 1.8150684931506849, "grad_norm": 10.345170974731445, "learning_rate": 2.773972602739726e-06, "loss": 0.6204, "step": 39750 }, { "epoch": 1.817351598173516, "grad_norm": 11.58834171295166, "learning_rate": 2.73972602739726e-06, "loss": 0.5325, "step": 39800 }, { "epoch": 1.8196347031963471, "grad_norm": 14.26885986328125, "learning_rate": 2.7054794520547943e-06, "loss": 0.5008, "step": 39850 }, { "epoch": 1.821917808219178, "grad_norm": 4.937170505523682, "learning_rate": 2.6712328767123286e-06, "loss": 0.7317, "step": 39900 }, { "epoch": 1.8242009132420092, "grad_norm": 27.849742889404297, "learning_rate": 2.636986301369863e-06, "loss": 0.6825, "step": 39950 }, { "epoch": 1.82648401826484, "grad_norm": 38.649810791015625, "learning_rate": 2.6027397260273973e-06, "loss": 0.4422, "step": 40000 }, { "epoch": 1.8287671232876712, "grad_norm": 52.95954895019531, "learning_rate": 2.5684931506849316e-06, "loss": 0.747, "step": 40050 }, { "epoch": 1.8310502283105023, "grad_norm": 10.486088752746582, "learning_rate": 2.5342465753424655e-06, "loss": 0.6757, "step": 40100 }, { "epoch": 1.8333333333333335, "grad_norm": 0.9142507314682007, "learning_rate": 2.4999999999999998e-06, "loss": 0.712, "step": 40150 }, { "epoch": 1.8356164383561644, "grad_norm": 51.62909698486328, "learning_rate": 2.465753424657534e-06, "loss": 0.6107, "step": 40200 }, { "epoch": 1.8378995433789953, "grad_norm": 31.60240936279297, "learning_rate": 2.4315068493150684e-06, "loss": 0.601, "step": 40250 }, { "epoch": 1.8401826484018264, "grad_norm": 55.16636657714844, "learning_rate": 2.3972602739726027e-06, "loss": 0.6121, "step": 40300 }, { "epoch": 1.8424657534246576, "grad_norm": 34.450416564941406, "learning_rate": 2.363013698630137e-06, "loss": 0.6331, "step": 40350 }, { "epoch": 1.8447488584474887, "grad_norm": 27.42693328857422, "learning_rate": 2.3287671232876713e-06, "loss": 0.733, "step": 40400 }, { "epoch": 1.8470319634703196, "grad_norm": 44.26624298095703, "learning_rate": 2.2945205479452052e-06, "loss": 0.7453, "step": 40450 }, { "epoch": 1.8493150684931505, "grad_norm": 16.07997703552246, "learning_rate": 2.2602739726027396e-06, "loss": 0.6628, "step": 40500 }, { "epoch": 1.8515981735159817, "grad_norm": 2.816776752471924, "learning_rate": 2.226027397260274e-06, "loss": 0.4991, "step": 40550 }, { "epoch": 1.8538812785388128, "grad_norm": 4.0401434898376465, "learning_rate": 2.191780821917808e-06, "loss": 0.6943, "step": 40600 }, { "epoch": 1.856164383561644, "grad_norm": 5.525669097900391, "learning_rate": 2.1575342465753425e-06, "loss": 0.6887, "step": 40650 }, { "epoch": 1.8584474885844748, "grad_norm": 85.989990234375, "learning_rate": 2.123287671232877e-06, "loss": 0.7425, "step": 40700 }, { "epoch": 1.860730593607306, "grad_norm": 4.5465779304504395, "learning_rate": 2.0890410958904107e-06, "loss": 0.7462, "step": 40750 }, { "epoch": 1.8630136986301369, "grad_norm": 5.920977592468262, "learning_rate": 2.054794520547945e-06, "loss": 0.5546, "step": 40800 }, { "epoch": 1.865296803652968, "grad_norm": 12.154388427734375, "learning_rate": 2.0205479452054793e-06, "loss": 0.6693, "step": 40850 }, { "epoch": 1.8675799086757991, "grad_norm": 16.800073623657227, "learning_rate": 1.9863013698630136e-06, "loss": 0.7908, "step": 40900 }, { "epoch": 1.8698630136986303, "grad_norm": 43.42325973510742, "learning_rate": 1.952054794520548e-06, "loss": 0.5155, "step": 40950 }, { "epoch": 1.8721461187214612, "grad_norm": 29.5067138671875, "learning_rate": 1.9178082191780823e-06, "loss": 0.7588, "step": 41000 }, { "epoch": 1.874429223744292, "grad_norm": 28.01750946044922, "learning_rate": 1.8835616438356164e-06, "loss": 0.6622, "step": 41050 }, { "epoch": 1.8767123287671232, "grad_norm": 16.869781494140625, "learning_rate": 1.8493150684931507e-06, "loss": 0.6737, "step": 41100 }, { "epoch": 1.8789954337899544, "grad_norm": 8.377634048461914, "learning_rate": 1.8150684931506848e-06, "loss": 0.6281, "step": 41150 }, { "epoch": 1.8812785388127855, "grad_norm": 16.61414337158203, "learning_rate": 1.780821917808219e-06, "loss": 0.6251, "step": 41200 }, { "epoch": 1.8835616438356164, "grad_norm": 16.144508361816406, "learning_rate": 1.7465753424657534e-06, "loss": 0.6607, "step": 41250 }, { "epoch": 1.8858447488584473, "grad_norm": 20.15201759338379, "learning_rate": 1.7123287671232877e-06, "loss": 0.5207, "step": 41300 }, { "epoch": 1.8881278538812785, "grad_norm": 7.15456485748291, "learning_rate": 1.6780821917808218e-06, "loss": 0.5882, "step": 41350 }, { "epoch": 1.8904109589041096, "grad_norm": 17.336624145507812, "learning_rate": 1.6438356164383561e-06, "loss": 0.556, "step": 41400 }, { "epoch": 1.8926940639269407, "grad_norm": 20.451026916503906, "learning_rate": 1.6095890410958904e-06, "loss": 0.6011, "step": 41450 }, { "epoch": 1.8949771689497716, "grad_norm": 33.44941329956055, "learning_rate": 1.5753424657534245e-06, "loss": 0.5166, "step": 41500 }, { "epoch": 1.8972602739726028, "grad_norm": 24.176786422729492, "learning_rate": 1.5410958904109589e-06, "loss": 0.6837, "step": 41550 }, { "epoch": 1.8995433789954337, "grad_norm": 17.142606735229492, "learning_rate": 1.5068493150684932e-06, "loss": 0.5813, "step": 41600 }, { "epoch": 1.9018264840182648, "grad_norm": 34.20349884033203, "learning_rate": 1.4726027397260273e-06, "loss": 0.786, "step": 41650 }, { "epoch": 1.904109589041096, "grad_norm": 1.5305472612380981, "learning_rate": 1.4383561643835616e-06, "loss": 0.5928, "step": 41700 }, { "epoch": 1.906392694063927, "grad_norm": 1.1295257806777954, "learning_rate": 1.404109589041096e-06, "loss": 0.7055, "step": 41750 }, { "epoch": 1.908675799086758, "grad_norm": 23.80326271057129, "learning_rate": 1.36986301369863e-06, "loss": 0.5861, "step": 41800 }, { "epoch": 1.910958904109589, "grad_norm": 3.346529960632324, "learning_rate": 1.3356164383561643e-06, "loss": 0.6864, "step": 41850 }, { "epoch": 1.91324200913242, "grad_norm": 34.07392883300781, "learning_rate": 1.3013698630136986e-06, "loss": 0.6754, "step": 41900 }, { "epoch": 1.9155251141552512, "grad_norm": 42.87485122680664, "learning_rate": 1.2671232876712327e-06, "loss": 0.613, "step": 41950 }, { "epoch": 1.9178082191780823, "grad_norm": 9.337113380432129, "learning_rate": 1.232876712328767e-06, "loss": 0.5302, "step": 42000 }, { "epoch": 1.9200913242009132, "grad_norm": 19.920682907104492, "learning_rate": 1.1986301369863014e-06, "loss": 0.4405, "step": 42050 }, { "epoch": 1.9223744292237441, "grad_norm": 24.49388313293457, "learning_rate": 1.1643835616438357e-06, "loss": 0.696, "step": 42100 }, { "epoch": 1.9246575342465753, "grad_norm": 7.732158184051514, "learning_rate": 1.1301369863013698e-06, "loss": 0.758, "step": 42150 }, { "epoch": 1.9269406392694064, "grad_norm": 6.940062046051025, "learning_rate": 1.095890410958904e-06, "loss": 0.6804, "step": 42200 }, { "epoch": 1.9292237442922375, "grad_norm": 1.062066674232483, "learning_rate": 1.0616438356164384e-06, "loss": 0.6059, "step": 42250 }, { "epoch": 1.9315068493150684, "grad_norm": 20.240144729614258, "learning_rate": 1.0273972602739725e-06, "loss": 0.7992, "step": 42300 }, { "epoch": 1.9337899543378996, "grad_norm": 19.4890193939209, "learning_rate": 9.931506849315068e-07, "loss": 0.6943, "step": 42350 }, { "epoch": 1.9360730593607305, "grad_norm": 25.273487091064453, "learning_rate": 9.589041095890411e-07, "loss": 0.5925, "step": 42400 }, { "epoch": 1.9383561643835616, "grad_norm": 21.916284561157227, "learning_rate": 9.246575342465753e-07, "loss": 0.6607, "step": 42450 }, { "epoch": 1.9406392694063928, "grad_norm": 38.107566833496094, "learning_rate": 8.904109589041095e-07, "loss": 0.5807, "step": 42500 }, { "epoch": 1.9429223744292239, "grad_norm": 8.979408264160156, "learning_rate": 8.561643835616439e-07, "loss": 0.6391, "step": 42550 }, { "epoch": 1.9452054794520548, "grad_norm": 20.871389389038086, "learning_rate": 8.219178082191781e-07, "loss": 0.764, "step": 42600 }, { "epoch": 1.9474885844748857, "grad_norm": 19.994056701660156, "learning_rate": 7.876712328767123e-07, "loss": 0.7959, "step": 42650 }, { "epoch": 1.9497716894977168, "grad_norm": 25.47404670715332, "learning_rate": 7.534246575342466e-07, "loss": 0.5579, "step": 42700 }, { "epoch": 1.952054794520548, "grad_norm": 10.598165512084961, "learning_rate": 7.191780821917808e-07, "loss": 0.8362, "step": 42750 }, { "epoch": 1.954337899543379, "grad_norm": 1.3319069147109985, "learning_rate": 6.84931506849315e-07, "loss": 0.8132, "step": 42800 }, { "epoch": 1.95662100456621, "grad_norm": 4.061497211456299, "learning_rate": 6.506849315068493e-07, "loss": 0.5364, "step": 42850 }, { "epoch": 1.958904109589041, "grad_norm": 3.6196768283843994, "learning_rate": 6.164383561643835e-07, "loss": 0.57, "step": 42900 }, { "epoch": 1.961187214611872, "grad_norm": 26.95933723449707, "learning_rate": 5.821917808219178e-07, "loss": 0.7364, "step": 42950 }, { "epoch": 1.9634703196347032, "grad_norm": 25.18138313293457, "learning_rate": 5.47945205479452e-07, "loss": 0.5801, "step": 43000 }, { "epoch": 1.9657534246575343, "grad_norm": 77.7193374633789, "learning_rate": 5.136986301369863e-07, "loss": 0.6965, "step": 43050 }, { "epoch": 1.9680365296803652, "grad_norm": 10.553460121154785, "learning_rate": 4.794520547945206e-07, "loss": 0.6019, "step": 43100 }, { "epoch": 1.9703196347031964, "grad_norm": 14.633034706115723, "learning_rate": 4.4520547945205477e-07, "loss": 0.5948, "step": 43150 }, { "epoch": 1.9726027397260273, "grad_norm": 10.625903129577637, "learning_rate": 4.1095890410958903e-07, "loss": 0.5868, "step": 43200 }, { "epoch": 1.9748858447488584, "grad_norm": 11.625406265258789, "learning_rate": 3.767123287671233e-07, "loss": 0.7335, "step": 43250 }, { "epoch": 1.9771689497716896, "grad_norm": 11.875858306884766, "learning_rate": 3.424657534246575e-07, "loss": 0.6305, "step": 43300 }, { "epoch": 1.9794520547945207, "grad_norm": 65.91475677490234, "learning_rate": 3.0821917808219176e-07, "loss": 0.5038, "step": 43350 }, { "epoch": 1.9817351598173516, "grad_norm": 7.164722442626953, "learning_rate": 2.73972602739726e-07, "loss": 0.6426, "step": 43400 }, { "epoch": 1.9840182648401825, "grad_norm": 19.229677200317383, "learning_rate": 2.397260273972603e-07, "loss": 0.7671, "step": 43450 }, { "epoch": 1.9863013698630136, "grad_norm": 9.592227935791016, "learning_rate": 2.0547945205479452e-07, "loss": 0.7229, "step": 43500 }, { "epoch": 1.9885844748858448, "grad_norm": 23.81863021850586, "learning_rate": 1.7123287671232875e-07, "loss": 0.5238, "step": 43550 }, { "epoch": 1.990867579908676, "grad_norm": 30.481460571289062, "learning_rate": 1.36986301369863e-07, "loss": 0.5882, "step": 43600 }, { "epoch": 1.9931506849315068, "grad_norm": 3.194218397140503, "learning_rate": 1.0273972602739726e-07, "loss": 0.5594, "step": 43650 }, { "epoch": 1.9954337899543377, "grad_norm": 32.75349807739258, "learning_rate": 6.84931506849315e-08, "loss": 0.7056, "step": 43700 }, { "epoch": 1.9977168949771689, "grad_norm": 37.13774490356445, "learning_rate": 3.424657534246575e-08, "loss": 0.7017, "step": 43750 }, { "epoch": 2.0, "grad_norm": 38.73429489135742, "learning_rate": 0.0, "loss": 0.6852, "step": 43800 }, { "epoch": 2.0, "step": 43800, "total_flos": 3.864501390676132e+17, "train_loss": 0.05644623499482734, "train_runtime": 4615.1991, "train_samples_per_second": 37.961, "train_steps_per_second": 9.49 } ], "logging_steps": 50, "max_steps": 43800, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.864501390676132e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }