{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 43800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00228310502283105,
"grad_norm": 33.85030746459961,
"learning_rate": 2.9965753424657534e-05,
"loss": 4.7256,
"step": 50
},
{
"epoch": 0.0045662100456621,
"grad_norm": 28.36233139038086,
"learning_rate": 2.993150684931507e-05,
"loss": 4.0093,
"step": 100
},
{
"epoch": 0.00684931506849315,
"grad_norm": 40.79988098144531,
"learning_rate": 2.9897260273972603e-05,
"loss": 3.3789,
"step": 150
},
{
"epoch": 0.0091324200913242,
"grad_norm": 38.37752914428711,
"learning_rate": 2.9863013698630136e-05,
"loss": 2.5531,
"step": 200
},
{
"epoch": 0.01141552511415525,
"grad_norm": 46.2108268737793,
"learning_rate": 2.9828767123287673e-05,
"loss": 2.4412,
"step": 250
},
{
"epoch": 0.0136986301369863,
"grad_norm": 22.16556739807129,
"learning_rate": 2.9794520547945206e-05,
"loss": 2.1968,
"step": 300
},
{
"epoch": 0.01598173515981735,
"grad_norm": 38.14749526977539,
"learning_rate": 2.9760273972602742e-05,
"loss": 2.0913,
"step": 350
},
{
"epoch": 0.0182648401826484,
"grad_norm": 38.938079833984375,
"learning_rate": 2.9726027397260275e-05,
"loss": 2.0564,
"step": 400
},
{
"epoch": 0.02054794520547945,
"grad_norm": 51.680259704589844,
"learning_rate": 2.969178082191781e-05,
"loss": 1.9041,
"step": 450
},
{
"epoch": 0.0228310502283105,
"grad_norm": 23.21047019958496,
"learning_rate": 2.9657534246575345e-05,
"loss": 1.7974,
"step": 500
},
{
"epoch": 0.02511415525114155,
"grad_norm": 27.936336517333984,
"learning_rate": 2.9623287671232878e-05,
"loss": 1.6824,
"step": 550
},
{
"epoch": 0.0273972602739726,
"grad_norm": 121.84676361083984,
"learning_rate": 2.958904109589041e-05,
"loss": 2.0853,
"step": 600
},
{
"epoch": 0.02968036529680365,
"grad_norm": 51.72910690307617,
"learning_rate": 2.9554794520547947e-05,
"loss": 1.8569,
"step": 650
},
{
"epoch": 0.0319634703196347,
"grad_norm": 27.697223663330078,
"learning_rate": 2.952054794520548e-05,
"loss": 1.7805,
"step": 700
},
{
"epoch": 0.03424657534246575,
"grad_norm": 59.87869644165039,
"learning_rate": 2.9486301369863017e-05,
"loss": 1.8421,
"step": 750
},
{
"epoch": 0.0365296803652968,
"grad_norm": 20.744386672973633,
"learning_rate": 2.945205479452055e-05,
"loss": 1.8149,
"step": 800
},
{
"epoch": 0.03881278538812785,
"grad_norm": 22.272050857543945,
"learning_rate": 2.9417808219178083e-05,
"loss": 1.6892,
"step": 850
},
{
"epoch": 0.0410958904109589,
"grad_norm": 34.34733963012695,
"learning_rate": 2.938356164383562e-05,
"loss": 1.6013,
"step": 900
},
{
"epoch": 0.04337899543378995,
"grad_norm": 19.482553482055664,
"learning_rate": 2.9349315068493152e-05,
"loss": 1.6772,
"step": 950
},
{
"epoch": 0.045662100456621,
"grad_norm": 19.61932945251465,
"learning_rate": 2.9315068493150685e-05,
"loss": 1.7372,
"step": 1000
},
{
"epoch": 0.04794520547945205,
"grad_norm": 47.28583526611328,
"learning_rate": 2.9280821917808222e-05,
"loss": 1.6459,
"step": 1050
},
{
"epoch": 0.0502283105022831,
"grad_norm": 74.12429809570312,
"learning_rate": 2.9246575342465755e-05,
"loss": 1.4242,
"step": 1100
},
{
"epoch": 0.05251141552511415,
"grad_norm": 28.49104118347168,
"learning_rate": 2.921232876712329e-05,
"loss": 1.4958,
"step": 1150
},
{
"epoch": 0.0547945205479452,
"grad_norm": 69.21525573730469,
"learning_rate": 2.9178082191780824e-05,
"loss": 1.4848,
"step": 1200
},
{
"epoch": 0.05707762557077625,
"grad_norm": 28.827421188354492,
"learning_rate": 2.9143835616438357e-05,
"loss": 1.451,
"step": 1250
},
{
"epoch": 0.0593607305936073,
"grad_norm": 69.60295867919922,
"learning_rate": 2.910958904109589e-05,
"loss": 1.6166,
"step": 1300
},
{
"epoch": 0.06164383561643835,
"grad_norm": 43.529075622558594,
"learning_rate": 2.9075342465753423e-05,
"loss": 1.5593,
"step": 1350
},
{
"epoch": 0.0639269406392694,
"grad_norm": 21.382102966308594,
"learning_rate": 2.9041095890410956e-05,
"loss": 1.5083,
"step": 1400
},
{
"epoch": 0.06621004566210045,
"grad_norm": 38.85325622558594,
"learning_rate": 2.9006849315068493e-05,
"loss": 1.6731,
"step": 1450
},
{
"epoch": 0.0684931506849315,
"grad_norm": 31.006072998046875,
"learning_rate": 2.8972602739726026e-05,
"loss": 1.4695,
"step": 1500
},
{
"epoch": 0.07077625570776255,
"grad_norm": 26.75164222717285,
"learning_rate": 2.8938356164383562e-05,
"loss": 1.574,
"step": 1550
},
{
"epoch": 0.0730593607305936,
"grad_norm": 38.76344299316406,
"learning_rate": 2.8904109589041095e-05,
"loss": 1.5674,
"step": 1600
},
{
"epoch": 0.07534246575342465,
"grad_norm": 34.11776351928711,
"learning_rate": 2.886986301369863e-05,
"loss": 1.3804,
"step": 1650
},
{
"epoch": 0.0776255707762557,
"grad_norm": 45.00776290893555,
"learning_rate": 2.8835616438356165e-05,
"loss": 1.5021,
"step": 1700
},
{
"epoch": 0.07990867579908675,
"grad_norm": 85.93338775634766,
"learning_rate": 2.8801369863013698e-05,
"loss": 1.4673,
"step": 1750
},
{
"epoch": 0.0821917808219178,
"grad_norm": 20.37348747253418,
"learning_rate": 2.876712328767123e-05,
"loss": 1.5315,
"step": 1800
},
{
"epoch": 0.08447488584474885,
"grad_norm": 18.73834800720215,
"learning_rate": 2.8732876712328767e-05,
"loss": 1.4074,
"step": 1850
},
{
"epoch": 0.0867579908675799,
"grad_norm": 38.19795608520508,
"learning_rate": 2.86986301369863e-05,
"loss": 1.4442,
"step": 1900
},
{
"epoch": 0.08904109589041095,
"grad_norm": 18.31586265563965,
"learning_rate": 2.8664383561643837e-05,
"loss": 1.4918,
"step": 1950
},
{
"epoch": 0.091324200913242,
"grad_norm": 44.28574752807617,
"learning_rate": 2.863013698630137e-05,
"loss": 1.3071,
"step": 2000
},
{
"epoch": 0.09360730593607305,
"grad_norm": 51.20829391479492,
"learning_rate": 2.8595890410958903e-05,
"loss": 1.6107,
"step": 2050
},
{
"epoch": 0.0958904109589041,
"grad_norm": 45.16651153564453,
"learning_rate": 2.856164383561644e-05,
"loss": 1.1947,
"step": 2100
},
{
"epoch": 0.09817351598173515,
"grad_norm": 3.6301872730255127,
"learning_rate": 2.8527397260273972e-05,
"loss": 1.4133,
"step": 2150
},
{
"epoch": 0.1004566210045662,
"grad_norm": 19.277009963989258,
"learning_rate": 2.8493150684931505e-05,
"loss": 1.2874,
"step": 2200
},
{
"epoch": 0.10273972602739725,
"grad_norm": 36.330936431884766,
"learning_rate": 2.8458904109589042e-05,
"loss": 1.4955,
"step": 2250
},
{
"epoch": 0.1050228310502283,
"grad_norm": 15.057435035705566,
"learning_rate": 2.8424657534246575e-05,
"loss": 1.3309,
"step": 2300
},
{
"epoch": 0.10730593607305935,
"grad_norm": 31.65122413635254,
"learning_rate": 2.839041095890411e-05,
"loss": 1.3836,
"step": 2350
},
{
"epoch": 0.1095890410958904,
"grad_norm": 26.276851654052734,
"learning_rate": 2.8356164383561644e-05,
"loss": 1.4872,
"step": 2400
},
{
"epoch": 0.11187214611872145,
"grad_norm": 80.19303131103516,
"learning_rate": 2.8321917808219177e-05,
"loss": 1.4708,
"step": 2450
},
{
"epoch": 0.1141552511415525,
"grad_norm": 23.674320220947266,
"learning_rate": 2.8287671232876714e-05,
"loss": 1.1475,
"step": 2500
},
{
"epoch": 0.11643835616438356,
"grad_norm": 29.5810604095459,
"learning_rate": 2.8253424657534247e-05,
"loss": 1.5903,
"step": 2550
},
{
"epoch": 0.1187214611872146,
"grad_norm": 32.49135208129883,
"learning_rate": 2.821917808219178e-05,
"loss": 1.3248,
"step": 2600
},
{
"epoch": 0.12100456621004566,
"grad_norm": 33.26783752441406,
"learning_rate": 2.8184931506849316e-05,
"loss": 1.5587,
"step": 2650
},
{
"epoch": 0.1232876712328767,
"grad_norm": 34.50761795043945,
"learning_rate": 2.815068493150685e-05,
"loss": 1.3713,
"step": 2700
},
{
"epoch": 0.12557077625570776,
"grad_norm": 21.493181228637695,
"learning_rate": 2.8116438356164386e-05,
"loss": 1.2665,
"step": 2750
},
{
"epoch": 0.1278538812785388,
"grad_norm": 10.42421817779541,
"learning_rate": 2.808219178082192e-05,
"loss": 1.3406,
"step": 2800
},
{
"epoch": 0.13013698630136986,
"grad_norm": 18.451990127563477,
"learning_rate": 2.8047945205479452e-05,
"loss": 1.2502,
"step": 2850
},
{
"epoch": 0.1324200913242009,
"grad_norm": 27.949146270751953,
"learning_rate": 2.801369863013699e-05,
"loss": 1.2492,
"step": 2900
},
{
"epoch": 0.13470319634703196,
"grad_norm": 19.943056106567383,
"learning_rate": 2.797945205479452e-05,
"loss": 1.4071,
"step": 2950
},
{
"epoch": 0.136986301369863,
"grad_norm": 30.464204788208008,
"learning_rate": 2.7945205479452054e-05,
"loss": 1.2135,
"step": 3000
},
{
"epoch": 0.13926940639269406,
"grad_norm": 20.832355499267578,
"learning_rate": 2.791095890410959e-05,
"loss": 1.2872,
"step": 3050
},
{
"epoch": 0.1415525114155251,
"grad_norm": 13.37592601776123,
"learning_rate": 2.7876712328767124e-05,
"loss": 1.1165,
"step": 3100
},
{
"epoch": 0.14383561643835616,
"grad_norm": 36.62568664550781,
"learning_rate": 2.784246575342466e-05,
"loss": 1.2849,
"step": 3150
},
{
"epoch": 0.1461187214611872,
"grad_norm": 11.884871482849121,
"learning_rate": 2.7808219178082193e-05,
"loss": 1.241,
"step": 3200
},
{
"epoch": 0.14840182648401826,
"grad_norm": 22.552181243896484,
"learning_rate": 2.7773972602739726e-05,
"loss": 1.1695,
"step": 3250
},
{
"epoch": 0.1506849315068493,
"grad_norm": 29.05255126953125,
"learning_rate": 2.7739726027397263e-05,
"loss": 1.3266,
"step": 3300
},
{
"epoch": 0.15296803652968036,
"grad_norm": 27.039731979370117,
"learning_rate": 2.7705479452054796e-05,
"loss": 1.4528,
"step": 3350
},
{
"epoch": 0.1552511415525114,
"grad_norm": 12.077860832214355,
"learning_rate": 2.767123287671233e-05,
"loss": 1.0587,
"step": 3400
},
{
"epoch": 0.15753424657534246,
"grad_norm": 33.64236068725586,
"learning_rate": 2.7636986301369865e-05,
"loss": 1.3053,
"step": 3450
},
{
"epoch": 0.1598173515981735,
"grad_norm": 7.781187534332275,
"learning_rate": 2.76027397260274e-05,
"loss": 1.5692,
"step": 3500
},
{
"epoch": 0.16210045662100456,
"grad_norm": 58.28102111816406,
"learning_rate": 2.7568493150684935e-05,
"loss": 1.0579,
"step": 3550
},
{
"epoch": 0.1643835616438356,
"grad_norm": 46.713260650634766,
"learning_rate": 2.7534246575342468e-05,
"loss": 1.1327,
"step": 3600
},
{
"epoch": 0.16666666666666666,
"grad_norm": 38.657493591308594,
"learning_rate": 2.75e-05,
"loss": 1.299,
"step": 3650
},
{
"epoch": 0.1689497716894977,
"grad_norm": 7.117087364196777,
"learning_rate": 2.7465753424657537e-05,
"loss": 1.0367,
"step": 3700
},
{
"epoch": 0.17123287671232876,
"grad_norm": 33.10891342163086,
"learning_rate": 2.743150684931507e-05,
"loss": 1.1318,
"step": 3750
},
{
"epoch": 0.1735159817351598,
"grad_norm": 38.68271255493164,
"learning_rate": 2.7397260273972603e-05,
"loss": 1.1906,
"step": 3800
},
{
"epoch": 0.17579908675799086,
"grad_norm": 45.01628494262695,
"learning_rate": 2.736301369863014e-05,
"loss": 1.1285,
"step": 3850
},
{
"epoch": 0.1780821917808219,
"grad_norm": 20.761018753051758,
"learning_rate": 2.732876712328767e-05,
"loss": 1.4117,
"step": 3900
},
{
"epoch": 0.18036529680365296,
"grad_norm": 46.919952392578125,
"learning_rate": 2.7294520547945206e-05,
"loss": 1.0591,
"step": 3950
},
{
"epoch": 0.182648401826484,
"grad_norm": 78.8492431640625,
"learning_rate": 2.726027397260274e-05,
"loss": 1.3741,
"step": 4000
},
{
"epoch": 0.18493150684931506,
"grad_norm": 25.57372283935547,
"learning_rate": 2.7226027397260272e-05,
"loss": 1.1287,
"step": 4050
},
{
"epoch": 0.1872146118721461,
"grad_norm": 16.47197723388672,
"learning_rate": 2.719178082191781e-05,
"loss": 1.3216,
"step": 4100
},
{
"epoch": 0.18949771689497716,
"grad_norm": 36.24203109741211,
"learning_rate": 2.715753424657534e-05,
"loss": 1.3095,
"step": 4150
},
{
"epoch": 0.1917808219178082,
"grad_norm": 8.002535820007324,
"learning_rate": 2.7123287671232875e-05,
"loss": 1.1426,
"step": 4200
},
{
"epoch": 0.19406392694063926,
"grad_norm": 9.112885475158691,
"learning_rate": 2.708904109589041e-05,
"loss": 1.4292,
"step": 4250
},
{
"epoch": 0.1963470319634703,
"grad_norm": 7.080036640167236,
"learning_rate": 2.7054794520547944e-05,
"loss": 1.2665,
"step": 4300
},
{
"epoch": 0.19863013698630136,
"grad_norm": 8.403115272521973,
"learning_rate": 2.702054794520548e-05,
"loss": 1.3686,
"step": 4350
},
{
"epoch": 0.2009132420091324,
"grad_norm": 29.843015670776367,
"learning_rate": 2.6986301369863014e-05,
"loss": 1.0842,
"step": 4400
},
{
"epoch": 0.20319634703196346,
"grad_norm": 16.962310791015625,
"learning_rate": 2.6952054794520547e-05,
"loss": 1.1983,
"step": 4450
},
{
"epoch": 0.2054794520547945,
"grad_norm": 19.98363494873047,
"learning_rate": 2.6917808219178083e-05,
"loss": 1.136,
"step": 4500
},
{
"epoch": 0.20776255707762556,
"grad_norm": 23.507699966430664,
"learning_rate": 2.6883561643835616e-05,
"loss": 1.4167,
"step": 4550
},
{
"epoch": 0.2100456621004566,
"grad_norm": 85.11144256591797,
"learning_rate": 2.684931506849315e-05,
"loss": 1.0667,
"step": 4600
},
{
"epoch": 0.21232876712328766,
"grad_norm": 25.13855743408203,
"learning_rate": 2.6815068493150686e-05,
"loss": 1.1755,
"step": 4650
},
{
"epoch": 0.2146118721461187,
"grad_norm": 21.816020965576172,
"learning_rate": 2.678082191780822e-05,
"loss": 1.1357,
"step": 4700
},
{
"epoch": 0.21689497716894976,
"grad_norm": 27.655017852783203,
"learning_rate": 2.6746575342465755e-05,
"loss": 1.2578,
"step": 4750
},
{
"epoch": 0.2191780821917808,
"grad_norm": 24.329933166503906,
"learning_rate": 2.6712328767123288e-05,
"loss": 1.1243,
"step": 4800
},
{
"epoch": 0.22146118721461186,
"grad_norm": 9.320807456970215,
"learning_rate": 2.667808219178082e-05,
"loss": 1.2211,
"step": 4850
},
{
"epoch": 0.2237442922374429,
"grad_norm": 14.245413780212402,
"learning_rate": 2.6643835616438358e-05,
"loss": 1.2804,
"step": 4900
},
{
"epoch": 0.22602739726027396,
"grad_norm": 46.82653045654297,
"learning_rate": 2.660958904109589e-05,
"loss": 1.2949,
"step": 4950
},
{
"epoch": 0.228310502283105,
"grad_norm": 43.06188201904297,
"learning_rate": 2.6575342465753424e-05,
"loss": 1.2012,
"step": 5000
},
{
"epoch": 0.23059360730593606,
"grad_norm": 50.084381103515625,
"learning_rate": 2.654109589041096e-05,
"loss": 1.2965,
"step": 5050
},
{
"epoch": 0.2328767123287671,
"grad_norm": 32.71940994262695,
"learning_rate": 2.6506849315068493e-05,
"loss": 1.0847,
"step": 5100
},
{
"epoch": 0.23515981735159816,
"grad_norm": 53.56361389160156,
"learning_rate": 2.647260273972603e-05,
"loss": 1.2125,
"step": 5150
},
{
"epoch": 0.2374429223744292,
"grad_norm": 57.06380081176758,
"learning_rate": 2.6438356164383563e-05,
"loss": 1.06,
"step": 5200
},
{
"epoch": 0.23972602739726026,
"grad_norm": 41.62690734863281,
"learning_rate": 2.6404109589041096e-05,
"loss": 1.1048,
"step": 5250
},
{
"epoch": 0.2420091324200913,
"grad_norm": 44.06789016723633,
"learning_rate": 2.6369863013698632e-05,
"loss": 1.2219,
"step": 5300
},
{
"epoch": 0.24429223744292236,
"grad_norm": 14.321037292480469,
"learning_rate": 2.6335616438356165e-05,
"loss": 1.2137,
"step": 5350
},
{
"epoch": 0.2465753424657534,
"grad_norm": 31.366552352905273,
"learning_rate": 2.6301369863013698e-05,
"loss": 1.2281,
"step": 5400
},
{
"epoch": 0.24885844748858446,
"grad_norm": 19.642141342163086,
"learning_rate": 2.6267123287671235e-05,
"loss": 1.0646,
"step": 5450
},
{
"epoch": 0.2511415525114155,
"grad_norm": 24.77635955810547,
"learning_rate": 2.6232876712328768e-05,
"loss": 1.0827,
"step": 5500
},
{
"epoch": 0.2534246575342466,
"grad_norm": 21.257320404052734,
"learning_rate": 2.6198630136986304e-05,
"loss": 1.1798,
"step": 5550
},
{
"epoch": 0.2557077625570776,
"grad_norm": 7.442146301269531,
"learning_rate": 2.6164383561643837e-05,
"loss": 1.2409,
"step": 5600
},
{
"epoch": 0.2579908675799087,
"grad_norm": 40.682579040527344,
"learning_rate": 2.613013698630137e-05,
"loss": 1.2001,
"step": 5650
},
{
"epoch": 0.2602739726027397,
"grad_norm": 22.984914779663086,
"learning_rate": 2.6095890410958907e-05,
"loss": 1.0922,
"step": 5700
},
{
"epoch": 0.2625570776255708,
"grad_norm": 14.088912010192871,
"learning_rate": 2.606164383561644e-05,
"loss": 1.0361,
"step": 5750
},
{
"epoch": 0.2648401826484018,
"grad_norm": 26.859743118286133,
"learning_rate": 2.6027397260273973e-05,
"loss": 1.3059,
"step": 5800
},
{
"epoch": 0.2671232876712329,
"grad_norm": 27.274446487426758,
"learning_rate": 2.599315068493151e-05,
"loss": 1.2146,
"step": 5850
},
{
"epoch": 0.2694063926940639,
"grad_norm": 15.877359390258789,
"learning_rate": 2.5958904109589042e-05,
"loss": 1.047,
"step": 5900
},
{
"epoch": 0.271689497716895,
"grad_norm": 27.11214256286621,
"learning_rate": 2.592465753424658e-05,
"loss": 1.1303,
"step": 5950
},
{
"epoch": 0.273972602739726,
"grad_norm": 25.463417053222656,
"learning_rate": 2.589041095890411e-05,
"loss": 0.9916,
"step": 6000
},
{
"epoch": 0.2762557077625571,
"grad_norm": 5.419765472412109,
"learning_rate": 2.5856164383561645e-05,
"loss": 1.3228,
"step": 6050
},
{
"epoch": 0.2785388127853881,
"grad_norm": 26.771987915039062,
"learning_rate": 2.582191780821918e-05,
"loss": 1.1163,
"step": 6100
},
{
"epoch": 0.2808219178082192,
"grad_norm": 40.04314422607422,
"learning_rate": 2.5787671232876714e-05,
"loss": 1.2227,
"step": 6150
},
{
"epoch": 0.2831050228310502,
"grad_norm": 31.262903213500977,
"learning_rate": 2.5753424657534247e-05,
"loss": 1.0956,
"step": 6200
},
{
"epoch": 0.2853881278538813,
"grad_norm": 8.211708068847656,
"learning_rate": 2.5719178082191784e-05,
"loss": 0.8686,
"step": 6250
},
{
"epoch": 0.2876712328767123,
"grad_norm": 7.24912166595459,
"learning_rate": 2.5684931506849317e-05,
"loss": 1.2297,
"step": 6300
},
{
"epoch": 0.2899543378995434,
"grad_norm": 11.469908714294434,
"learning_rate": 2.5650684931506853e-05,
"loss": 1.1037,
"step": 6350
},
{
"epoch": 0.2922374429223744,
"grad_norm": 11.30005168914795,
"learning_rate": 2.5616438356164386e-05,
"loss": 1.1043,
"step": 6400
},
{
"epoch": 0.2945205479452055,
"grad_norm": 9.771041870117188,
"learning_rate": 2.558219178082192e-05,
"loss": 1.09,
"step": 6450
},
{
"epoch": 0.2968036529680365,
"grad_norm": 22.025493621826172,
"learning_rate": 2.5547945205479452e-05,
"loss": 1.1252,
"step": 6500
},
{
"epoch": 0.2990867579908676,
"grad_norm": 22.842763900756836,
"learning_rate": 2.5513698630136985e-05,
"loss": 1.0219,
"step": 6550
},
{
"epoch": 0.3013698630136986,
"grad_norm": 49.29541015625,
"learning_rate": 2.5479452054794518e-05,
"loss": 0.8537,
"step": 6600
},
{
"epoch": 0.3036529680365297,
"grad_norm": 32.22005844116211,
"learning_rate": 2.5445205479452055e-05,
"loss": 1.1211,
"step": 6650
},
{
"epoch": 0.3059360730593607,
"grad_norm": 49.50102615356445,
"learning_rate": 2.5410958904109588e-05,
"loss": 1.0462,
"step": 6700
},
{
"epoch": 0.3082191780821918,
"grad_norm": 69.23637390136719,
"learning_rate": 2.5376712328767124e-05,
"loss": 0.9964,
"step": 6750
},
{
"epoch": 0.3105022831050228,
"grad_norm": 15.762770652770996,
"learning_rate": 2.5342465753424657e-05,
"loss": 1.3032,
"step": 6800
},
{
"epoch": 0.3127853881278539,
"grad_norm": 36.22660827636719,
"learning_rate": 2.530821917808219e-05,
"loss": 1.1776,
"step": 6850
},
{
"epoch": 0.3150684931506849,
"grad_norm": 6.89371395111084,
"learning_rate": 2.5273972602739727e-05,
"loss": 1.1224,
"step": 6900
},
{
"epoch": 0.317351598173516,
"grad_norm": 76.80335235595703,
"learning_rate": 2.523972602739726e-05,
"loss": 0.9386,
"step": 6950
},
{
"epoch": 0.319634703196347,
"grad_norm": 50.3594970703125,
"learning_rate": 2.5205479452054793e-05,
"loss": 1.0721,
"step": 7000
},
{
"epoch": 0.3219178082191781,
"grad_norm": 26.805213928222656,
"learning_rate": 2.517123287671233e-05,
"loss": 1.2563,
"step": 7050
},
{
"epoch": 0.3242009132420091,
"grad_norm": 39.83240509033203,
"learning_rate": 2.5136986301369862e-05,
"loss": 1.0924,
"step": 7100
},
{
"epoch": 0.3264840182648402,
"grad_norm": 41.219818115234375,
"learning_rate": 2.51027397260274e-05,
"loss": 1.0771,
"step": 7150
},
{
"epoch": 0.3287671232876712,
"grad_norm": 63.68681335449219,
"learning_rate": 2.5068493150684932e-05,
"loss": 1.1793,
"step": 7200
},
{
"epoch": 0.3310502283105023,
"grad_norm": 96.54421997070312,
"learning_rate": 2.5034246575342465e-05,
"loss": 1.185,
"step": 7250
},
{
"epoch": 0.3333333333333333,
"grad_norm": 35.43101119995117,
"learning_rate": 2.5e-05,
"loss": 1.3505,
"step": 7300
},
{
"epoch": 0.3356164383561644,
"grad_norm": 26.726581573486328,
"learning_rate": 2.4965753424657534e-05,
"loss": 1.0832,
"step": 7350
},
{
"epoch": 0.3378995433789954,
"grad_norm": 22.066526412963867,
"learning_rate": 2.4931506849315067e-05,
"loss": 1.2593,
"step": 7400
},
{
"epoch": 0.3401826484018265,
"grad_norm": 9.076361656188965,
"learning_rate": 2.4897260273972604e-05,
"loss": 0.9935,
"step": 7450
},
{
"epoch": 0.3424657534246575,
"grad_norm": 32.710853576660156,
"learning_rate": 2.4863013698630137e-05,
"loss": 1.2397,
"step": 7500
},
{
"epoch": 0.3447488584474886,
"grad_norm": 24.153223037719727,
"learning_rate": 2.4828767123287673e-05,
"loss": 1.1828,
"step": 7550
},
{
"epoch": 0.3470319634703196,
"grad_norm": 21.1640682220459,
"learning_rate": 2.4794520547945206e-05,
"loss": 1.2872,
"step": 7600
},
{
"epoch": 0.3493150684931507,
"grad_norm": 8.183709144592285,
"learning_rate": 2.476027397260274e-05,
"loss": 1.0255,
"step": 7650
},
{
"epoch": 0.3515981735159817,
"grad_norm": 32.05094528198242,
"learning_rate": 2.4726027397260276e-05,
"loss": 1.1408,
"step": 7700
},
{
"epoch": 0.3538812785388128,
"grad_norm": 42.780330657958984,
"learning_rate": 2.469178082191781e-05,
"loss": 1.0889,
"step": 7750
},
{
"epoch": 0.3561643835616438,
"grad_norm": 12.330619812011719,
"learning_rate": 2.4657534246575342e-05,
"loss": 1.1555,
"step": 7800
},
{
"epoch": 0.3584474885844749,
"grad_norm": 52.48133850097656,
"learning_rate": 2.4623287671232878e-05,
"loss": 0.9871,
"step": 7850
},
{
"epoch": 0.3607305936073059,
"grad_norm": 17.115123748779297,
"learning_rate": 2.458904109589041e-05,
"loss": 1.2171,
"step": 7900
},
{
"epoch": 0.363013698630137,
"grad_norm": 24.72197914123535,
"learning_rate": 2.4554794520547948e-05,
"loss": 0.9396,
"step": 7950
},
{
"epoch": 0.365296803652968,
"grad_norm": 59.79161834716797,
"learning_rate": 2.452054794520548e-05,
"loss": 1.161,
"step": 8000
},
{
"epoch": 0.3675799086757991,
"grad_norm": 27.012365341186523,
"learning_rate": 2.4486301369863014e-05,
"loss": 1.2269,
"step": 8050
},
{
"epoch": 0.3698630136986301,
"grad_norm": 10.282275199890137,
"learning_rate": 2.445205479452055e-05,
"loss": 1.0582,
"step": 8100
},
{
"epoch": 0.3721461187214612,
"grad_norm": 16.697723388671875,
"learning_rate": 2.4417808219178083e-05,
"loss": 1.1504,
"step": 8150
},
{
"epoch": 0.3744292237442922,
"grad_norm": 19.30514144897461,
"learning_rate": 2.4383561643835616e-05,
"loss": 1.0279,
"step": 8200
},
{
"epoch": 0.3767123287671233,
"grad_norm": 49.45153045654297,
"learning_rate": 2.4349315068493153e-05,
"loss": 1.0799,
"step": 8250
},
{
"epoch": 0.3789954337899543,
"grad_norm": 33.181060791015625,
"learning_rate": 2.4315068493150686e-05,
"loss": 1.1191,
"step": 8300
},
{
"epoch": 0.3812785388127854,
"grad_norm": 79.8065185546875,
"learning_rate": 2.4280821917808222e-05,
"loss": 1.1442,
"step": 8350
},
{
"epoch": 0.3835616438356164,
"grad_norm": 48.79990005493164,
"learning_rate": 2.4246575342465755e-05,
"loss": 1.349,
"step": 8400
},
{
"epoch": 0.3858447488584475,
"grad_norm": 56.24677658081055,
"learning_rate": 2.4212328767123288e-05,
"loss": 1.0238,
"step": 8450
},
{
"epoch": 0.3881278538812785,
"grad_norm": 12.791101455688477,
"learning_rate": 2.4178082191780825e-05,
"loss": 1.0045,
"step": 8500
},
{
"epoch": 0.3904109589041096,
"grad_norm": 13.716742515563965,
"learning_rate": 2.4143835616438358e-05,
"loss": 1.0964,
"step": 8550
},
{
"epoch": 0.3926940639269406,
"grad_norm": 6.694267272949219,
"learning_rate": 2.410958904109589e-05,
"loss": 0.9396,
"step": 8600
},
{
"epoch": 0.3949771689497717,
"grad_norm": 19.237689971923828,
"learning_rate": 2.4075342465753427e-05,
"loss": 1.0931,
"step": 8650
},
{
"epoch": 0.3972602739726027,
"grad_norm": 23.344566345214844,
"learning_rate": 2.404109589041096e-05,
"loss": 1.3482,
"step": 8700
},
{
"epoch": 0.3995433789954338,
"grad_norm": 9.369404792785645,
"learning_rate": 2.4006849315068497e-05,
"loss": 1.0205,
"step": 8750
},
{
"epoch": 0.4018264840182648,
"grad_norm": 20.9238224029541,
"learning_rate": 2.397260273972603e-05,
"loss": 1.0606,
"step": 8800
},
{
"epoch": 0.4041095890410959,
"grad_norm": 66.00227355957031,
"learning_rate": 2.3938356164383563e-05,
"loss": 1.103,
"step": 8850
},
{
"epoch": 0.4063926940639269,
"grad_norm": 5.158641815185547,
"learning_rate": 2.39041095890411e-05,
"loss": 1.0207,
"step": 8900
},
{
"epoch": 0.408675799086758,
"grad_norm": 19.67837142944336,
"learning_rate": 2.3869863013698632e-05,
"loss": 1.108,
"step": 8950
},
{
"epoch": 0.410958904109589,
"grad_norm": 48.31447219848633,
"learning_rate": 2.3835616438356165e-05,
"loss": 1.0708,
"step": 9000
},
{
"epoch": 0.4132420091324201,
"grad_norm": 18.173908233642578,
"learning_rate": 2.38013698630137e-05,
"loss": 1.0449,
"step": 9050
},
{
"epoch": 0.4155251141552511,
"grad_norm": 57.505226135253906,
"learning_rate": 2.376712328767123e-05,
"loss": 1.1911,
"step": 9100
},
{
"epoch": 0.4178082191780822,
"grad_norm": 39.649169921875,
"learning_rate": 2.3732876712328768e-05,
"loss": 1.1708,
"step": 9150
},
{
"epoch": 0.4200913242009132,
"grad_norm": 7.622274398803711,
"learning_rate": 2.36986301369863e-05,
"loss": 0.9928,
"step": 9200
},
{
"epoch": 0.4223744292237443,
"grad_norm": 17.452634811401367,
"learning_rate": 2.3664383561643834e-05,
"loss": 0.9558,
"step": 9250
},
{
"epoch": 0.4246575342465753,
"grad_norm": 59.85329055786133,
"learning_rate": 2.363013698630137e-05,
"loss": 1.0805,
"step": 9300
},
{
"epoch": 0.4269406392694064,
"grad_norm": 9.634559631347656,
"learning_rate": 2.3595890410958903e-05,
"loss": 0.9829,
"step": 9350
},
{
"epoch": 0.4292237442922374,
"grad_norm": 1.924180507659912,
"learning_rate": 2.3561643835616436e-05,
"loss": 1.0737,
"step": 9400
},
{
"epoch": 0.4315068493150685,
"grad_norm": 31.276365280151367,
"learning_rate": 2.3527397260273973e-05,
"loss": 1.0905,
"step": 9450
},
{
"epoch": 0.4337899543378995,
"grad_norm": 29.218658447265625,
"learning_rate": 2.3493150684931506e-05,
"loss": 1.2167,
"step": 9500
},
{
"epoch": 0.4360730593607306,
"grad_norm": 42.0178108215332,
"learning_rate": 2.3458904109589042e-05,
"loss": 1.0009,
"step": 9550
},
{
"epoch": 0.4383561643835616,
"grad_norm": 36.67307662963867,
"learning_rate": 2.3424657534246575e-05,
"loss": 1.1016,
"step": 9600
},
{
"epoch": 0.4406392694063927,
"grad_norm": 21.634462356567383,
"learning_rate": 2.339041095890411e-05,
"loss": 1.0895,
"step": 9650
},
{
"epoch": 0.4429223744292237,
"grad_norm": 20.32236671447754,
"learning_rate": 2.3356164383561645e-05,
"loss": 1.1135,
"step": 9700
},
{
"epoch": 0.4452054794520548,
"grad_norm": 5.574302673339844,
"learning_rate": 2.3321917808219178e-05,
"loss": 1.1313,
"step": 9750
},
{
"epoch": 0.4474885844748858,
"grad_norm": 5.582201957702637,
"learning_rate": 2.328767123287671e-05,
"loss": 1.0072,
"step": 9800
},
{
"epoch": 0.4497716894977169,
"grad_norm": 58.499603271484375,
"learning_rate": 2.3253424657534247e-05,
"loss": 1.2044,
"step": 9850
},
{
"epoch": 0.4520547945205479,
"grad_norm": 18.887069702148438,
"learning_rate": 2.321917808219178e-05,
"loss": 1.0175,
"step": 9900
},
{
"epoch": 0.454337899543379,
"grad_norm": 33.383880615234375,
"learning_rate": 2.3184931506849317e-05,
"loss": 1.1733,
"step": 9950
},
{
"epoch": 0.45662100456621,
"grad_norm": 85.34075927734375,
"learning_rate": 2.315068493150685e-05,
"loss": 1.1876,
"step": 10000
},
{
"epoch": 0.4589041095890411,
"grad_norm": 22.957134246826172,
"learning_rate": 2.3116438356164383e-05,
"loss": 1.074,
"step": 10050
},
{
"epoch": 0.4611872146118721,
"grad_norm": 57.47038269042969,
"learning_rate": 2.308219178082192e-05,
"loss": 0.9007,
"step": 10100
},
{
"epoch": 0.4634703196347032,
"grad_norm": 11.614636421203613,
"learning_rate": 2.3047945205479452e-05,
"loss": 1.0877,
"step": 10150
},
{
"epoch": 0.4657534246575342,
"grad_norm": 8.970778465270996,
"learning_rate": 2.3013698630136985e-05,
"loss": 1.3286,
"step": 10200
},
{
"epoch": 0.4680365296803653,
"grad_norm": 43.425514221191406,
"learning_rate": 2.2979452054794522e-05,
"loss": 1.1883,
"step": 10250
},
{
"epoch": 0.4703196347031963,
"grad_norm": 12.85531234741211,
"learning_rate": 2.2945205479452055e-05,
"loss": 1.0921,
"step": 10300
},
{
"epoch": 0.4726027397260274,
"grad_norm": 6.603804111480713,
"learning_rate": 2.291095890410959e-05,
"loss": 1.1883,
"step": 10350
},
{
"epoch": 0.4748858447488584,
"grad_norm": 47.8962516784668,
"learning_rate": 2.2876712328767124e-05,
"loss": 1.1814,
"step": 10400
},
{
"epoch": 0.4771689497716895,
"grad_norm": 59.406280517578125,
"learning_rate": 2.2842465753424657e-05,
"loss": 1.115,
"step": 10450
},
{
"epoch": 0.4794520547945205,
"grad_norm": 43.54423522949219,
"learning_rate": 2.2808219178082194e-05,
"loss": 0.998,
"step": 10500
},
{
"epoch": 0.4817351598173516,
"grad_norm": 17.37261962890625,
"learning_rate": 2.2773972602739727e-05,
"loss": 0.9542,
"step": 10550
},
{
"epoch": 0.4840182648401826,
"grad_norm": 38.04684829711914,
"learning_rate": 2.273972602739726e-05,
"loss": 1.0195,
"step": 10600
},
{
"epoch": 0.4863013698630137,
"grad_norm": 17.202608108520508,
"learning_rate": 2.2705479452054796e-05,
"loss": 1.109,
"step": 10650
},
{
"epoch": 0.4885844748858447,
"grad_norm": 4.1588897705078125,
"learning_rate": 2.267123287671233e-05,
"loss": 1.0125,
"step": 10700
},
{
"epoch": 0.4908675799086758,
"grad_norm": 10.435503005981445,
"learning_rate": 2.2636986301369866e-05,
"loss": 1.0044,
"step": 10750
},
{
"epoch": 0.4931506849315068,
"grad_norm": 62.7014045715332,
"learning_rate": 2.26027397260274e-05,
"loss": 1.0457,
"step": 10800
},
{
"epoch": 0.4954337899543379,
"grad_norm": 19.47702407836914,
"learning_rate": 2.2568493150684932e-05,
"loss": 1.0468,
"step": 10850
},
{
"epoch": 0.4977168949771689,
"grad_norm": 42.62397766113281,
"learning_rate": 2.253424657534247e-05,
"loss": 1.0374,
"step": 10900
},
{
"epoch": 0.5,
"grad_norm": 52.27951431274414,
"learning_rate": 2.25e-05,
"loss": 1.0967,
"step": 10950
},
{
"epoch": 0.502283105022831,
"grad_norm": 8.714099884033203,
"learning_rate": 2.2465753424657534e-05,
"loss": 1.2492,
"step": 11000
},
{
"epoch": 0.5045662100456622,
"grad_norm": 10.670646667480469,
"learning_rate": 2.243150684931507e-05,
"loss": 1.23,
"step": 11050
},
{
"epoch": 0.5068493150684932,
"grad_norm": 30.391632080078125,
"learning_rate": 2.2397260273972604e-05,
"loss": 1.3576,
"step": 11100
},
{
"epoch": 0.5091324200913242,
"grad_norm": 36.597408294677734,
"learning_rate": 2.2363013698630137e-05,
"loss": 0.9834,
"step": 11150
},
{
"epoch": 0.5114155251141552,
"grad_norm": 58.925941467285156,
"learning_rate": 2.2328767123287673e-05,
"loss": 1.139,
"step": 11200
},
{
"epoch": 0.5136986301369864,
"grad_norm": 52.20880126953125,
"learning_rate": 2.2294520547945206e-05,
"loss": 1.1037,
"step": 11250
},
{
"epoch": 0.5159817351598174,
"grad_norm": 104.13248443603516,
"learning_rate": 2.2260273972602743e-05,
"loss": 1.0124,
"step": 11300
},
{
"epoch": 0.5182648401826484,
"grad_norm": 37.718406677246094,
"learning_rate": 2.2226027397260276e-05,
"loss": 1.1873,
"step": 11350
},
{
"epoch": 0.5205479452054794,
"grad_norm": 31.402559280395508,
"learning_rate": 2.219178082191781e-05,
"loss": 1.1506,
"step": 11400
},
{
"epoch": 0.5228310502283106,
"grad_norm": 7.707183361053467,
"learning_rate": 2.2157534246575345e-05,
"loss": 1.2423,
"step": 11450
},
{
"epoch": 0.5251141552511416,
"grad_norm": 45.80585861206055,
"learning_rate": 2.212328767123288e-05,
"loss": 1.1457,
"step": 11500
},
{
"epoch": 0.5273972602739726,
"grad_norm": 58.7805061340332,
"learning_rate": 2.208904109589041e-05,
"loss": 1.0302,
"step": 11550
},
{
"epoch": 0.5296803652968036,
"grad_norm": 29.0611515045166,
"learning_rate": 2.2054794520547945e-05,
"loss": 1.1852,
"step": 11600
},
{
"epoch": 0.5319634703196348,
"grad_norm": 17.17108917236328,
"learning_rate": 2.2020547945205478e-05,
"loss": 1.0164,
"step": 11650
},
{
"epoch": 0.5342465753424658,
"grad_norm": 28.332836151123047,
"learning_rate": 2.1986301369863014e-05,
"loss": 0.968,
"step": 11700
},
{
"epoch": 0.5365296803652968,
"grad_norm": 7.278358459472656,
"learning_rate": 2.1952054794520547e-05,
"loss": 1.2058,
"step": 11750
},
{
"epoch": 0.5388127853881278,
"grad_norm": 15.584250450134277,
"learning_rate": 2.191780821917808e-05,
"loss": 1.0892,
"step": 11800
},
{
"epoch": 0.541095890410959,
"grad_norm": 22.4052791595459,
"learning_rate": 2.1883561643835617e-05,
"loss": 0.9984,
"step": 11850
},
{
"epoch": 0.54337899543379,
"grad_norm": 31.585800170898438,
"learning_rate": 2.184931506849315e-05,
"loss": 1.2338,
"step": 11900
},
{
"epoch": 0.545662100456621,
"grad_norm": 19.224178314208984,
"learning_rate": 2.1815068493150683e-05,
"loss": 1.0994,
"step": 11950
},
{
"epoch": 0.547945205479452,
"grad_norm": 15.44568157196045,
"learning_rate": 2.178082191780822e-05,
"loss": 1.0938,
"step": 12000
},
{
"epoch": 0.5502283105022832,
"grad_norm": 9.826760292053223,
"learning_rate": 2.1746575342465752e-05,
"loss": 1.1607,
"step": 12050
},
{
"epoch": 0.5525114155251142,
"grad_norm": 8.327254295349121,
"learning_rate": 2.171232876712329e-05,
"loss": 1.0973,
"step": 12100
},
{
"epoch": 0.5547945205479452,
"grad_norm": 39.339962005615234,
"learning_rate": 2.167808219178082e-05,
"loss": 1.1066,
"step": 12150
},
{
"epoch": 0.5570776255707762,
"grad_norm": 27.773502349853516,
"learning_rate": 2.1643835616438355e-05,
"loss": 0.9664,
"step": 12200
},
{
"epoch": 0.5593607305936074,
"grad_norm": 34.17534637451172,
"learning_rate": 2.160958904109589e-05,
"loss": 1.2806,
"step": 12250
},
{
"epoch": 0.5616438356164384,
"grad_norm": 14.751806259155273,
"learning_rate": 2.1575342465753424e-05,
"loss": 1.0973,
"step": 12300
},
{
"epoch": 0.5639269406392694,
"grad_norm": 39.10814666748047,
"learning_rate": 2.1541095890410957e-05,
"loss": 1.0821,
"step": 12350
},
{
"epoch": 0.5662100456621004,
"grad_norm": 14.468631744384766,
"learning_rate": 2.1506849315068494e-05,
"loss": 0.8218,
"step": 12400
},
{
"epoch": 0.5684931506849316,
"grad_norm": 30.599267959594727,
"learning_rate": 2.1472602739726027e-05,
"loss": 1.0412,
"step": 12450
},
{
"epoch": 0.5707762557077626,
"grad_norm": 15.382964134216309,
"learning_rate": 2.1438356164383563e-05,
"loss": 0.9351,
"step": 12500
},
{
"epoch": 0.5730593607305936,
"grad_norm": 44.646480560302734,
"learning_rate": 2.1404109589041096e-05,
"loss": 1.1423,
"step": 12550
},
{
"epoch": 0.5753424657534246,
"grad_norm": 40.618309020996094,
"learning_rate": 2.136986301369863e-05,
"loss": 1.0657,
"step": 12600
},
{
"epoch": 0.5776255707762558,
"grad_norm": 50.3376579284668,
"learning_rate": 2.1335616438356166e-05,
"loss": 0.9775,
"step": 12650
},
{
"epoch": 0.5799086757990868,
"grad_norm": 60.17688751220703,
"learning_rate": 2.13013698630137e-05,
"loss": 1.1291,
"step": 12700
},
{
"epoch": 0.5821917808219178,
"grad_norm": 11.148223876953125,
"learning_rate": 2.126712328767123e-05,
"loss": 1.1556,
"step": 12750
},
{
"epoch": 0.5844748858447488,
"grad_norm": 20.931018829345703,
"learning_rate": 2.1232876712328768e-05,
"loss": 1.0614,
"step": 12800
},
{
"epoch": 0.58675799086758,
"grad_norm": 29.79435157775879,
"learning_rate": 2.11986301369863e-05,
"loss": 1.2341,
"step": 12850
},
{
"epoch": 0.589041095890411,
"grad_norm": 27.72561264038086,
"learning_rate": 2.1164383561643838e-05,
"loss": 0.9783,
"step": 12900
},
{
"epoch": 0.591324200913242,
"grad_norm": 6.564560890197754,
"learning_rate": 2.113013698630137e-05,
"loss": 1.0293,
"step": 12950
},
{
"epoch": 0.593607305936073,
"grad_norm": 68.2934799194336,
"learning_rate": 2.1095890410958904e-05,
"loss": 1.1324,
"step": 13000
},
{
"epoch": 0.5958904109589042,
"grad_norm": 8.194302558898926,
"learning_rate": 2.106164383561644e-05,
"loss": 0.9099,
"step": 13050
},
{
"epoch": 0.5981735159817352,
"grad_norm": 25.813316345214844,
"learning_rate": 2.1027397260273973e-05,
"loss": 1.0368,
"step": 13100
},
{
"epoch": 0.6004566210045662,
"grad_norm": 19.83176612854004,
"learning_rate": 2.0993150684931506e-05,
"loss": 0.9496,
"step": 13150
},
{
"epoch": 0.6027397260273972,
"grad_norm": 17.05171012878418,
"learning_rate": 2.0958904109589043e-05,
"loss": 1.1386,
"step": 13200
},
{
"epoch": 0.6050228310502284,
"grad_norm": 12.28995418548584,
"learning_rate": 2.0924657534246576e-05,
"loss": 1.1897,
"step": 13250
},
{
"epoch": 0.6073059360730594,
"grad_norm": 22.48783302307129,
"learning_rate": 2.0890410958904112e-05,
"loss": 1.1413,
"step": 13300
},
{
"epoch": 0.6095890410958904,
"grad_norm": 37.44598388671875,
"learning_rate": 2.0856164383561645e-05,
"loss": 1.1138,
"step": 13350
},
{
"epoch": 0.6118721461187214,
"grad_norm": 19.184656143188477,
"learning_rate": 2.0821917808219178e-05,
"loss": 1.0924,
"step": 13400
},
{
"epoch": 0.6141552511415526,
"grad_norm": 30.34543228149414,
"learning_rate": 2.0787671232876715e-05,
"loss": 1.0382,
"step": 13450
},
{
"epoch": 0.6164383561643836,
"grad_norm": 18.820228576660156,
"learning_rate": 2.0753424657534248e-05,
"loss": 1.0618,
"step": 13500
},
{
"epoch": 0.6187214611872146,
"grad_norm": 3.0790977478027344,
"learning_rate": 2.071917808219178e-05,
"loss": 1.0412,
"step": 13550
},
{
"epoch": 0.6210045662100456,
"grad_norm": 42.972923278808594,
"learning_rate": 2.0684931506849317e-05,
"loss": 0.9701,
"step": 13600
},
{
"epoch": 0.6232876712328768,
"grad_norm": 13.229798316955566,
"learning_rate": 2.065068493150685e-05,
"loss": 0.9746,
"step": 13650
},
{
"epoch": 0.6255707762557078,
"grad_norm": 87.67366790771484,
"learning_rate": 2.0616438356164387e-05,
"loss": 1.1411,
"step": 13700
},
{
"epoch": 0.6278538812785388,
"grad_norm": 12.436323165893555,
"learning_rate": 2.058219178082192e-05,
"loss": 1.208,
"step": 13750
},
{
"epoch": 0.6301369863013698,
"grad_norm": 19.009395599365234,
"learning_rate": 2.0547945205479453e-05,
"loss": 1.0635,
"step": 13800
},
{
"epoch": 0.632420091324201,
"grad_norm": 6.680107593536377,
"learning_rate": 2.051369863013699e-05,
"loss": 1.0463,
"step": 13850
},
{
"epoch": 0.634703196347032,
"grad_norm": 11.398260116577148,
"learning_rate": 2.0479452054794522e-05,
"loss": 0.9179,
"step": 13900
},
{
"epoch": 0.636986301369863,
"grad_norm": 4.983315467834473,
"learning_rate": 2.0445205479452055e-05,
"loss": 1.0699,
"step": 13950
},
{
"epoch": 0.639269406392694,
"grad_norm": 29.882274627685547,
"learning_rate": 2.041095890410959e-05,
"loss": 1.1485,
"step": 14000
},
{
"epoch": 0.6415525114155252,
"grad_norm": 9.104654312133789,
"learning_rate": 2.0376712328767125e-05,
"loss": 0.9783,
"step": 14050
},
{
"epoch": 0.6438356164383562,
"grad_norm": 8.866716384887695,
"learning_rate": 2.034246575342466e-05,
"loss": 0.9843,
"step": 14100
},
{
"epoch": 0.6461187214611872,
"grad_norm": 20.7504825592041,
"learning_rate": 2.0308219178082194e-05,
"loss": 0.9999,
"step": 14150
},
{
"epoch": 0.6484018264840182,
"grad_norm": 16.239980697631836,
"learning_rate": 2.0273972602739724e-05,
"loss": 0.9305,
"step": 14200
},
{
"epoch": 0.6506849315068494,
"grad_norm": 52.316673278808594,
"learning_rate": 2.023972602739726e-05,
"loss": 0.9185,
"step": 14250
},
{
"epoch": 0.6529680365296804,
"grad_norm": 13.330124855041504,
"learning_rate": 2.0205479452054793e-05,
"loss": 0.9943,
"step": 14300
},
{
"epoch": 0.6552511415525114,
"grad_norm": 13.177003860473633,
"learning_rate": 2.0171232876712326e-05,
"loss": 0.7542,
"step": 14350
},
{
"epoch": 0.6575342465753424,
"grad_norm": 36.442108154296875,
"learning_rate": 2.0136986301369863e-05,
"loss": 0.8722,
"step": 14400
},
{
"epoch": 0.6598173515981736,
"grad_norm": 26.251983642578125,
"learning_rate": 2.0102739726027396e-05,
"loss": 0.9657,
"step": 14450
},
{
"epoch": 0.6621004566210046,
"grad_norm": 35.95085144042969,
"learning_rate": 2.0068493150684932e-05,
"loss": 1.0656,
"step": 14500
},
{
"epoch": 0.6643835616438356,
"grad_norm": 29.169719696044922,
"learning_rate": 2.0034246575342465e-05,
"loss": 1.076,
"step": 14550
},
{
"epoch": 0.6666666666666666,
"grad_norm": 49.64678955078125,
"learning_rate": 1.9999999999999998e-05,
"loss": 0.9267,
"step": 14600
},
{
"epoch": 0.6689497716894978,
"grad_norm": 25.249004364013672,
"learning_rate": 1.9965753424657535e-05,
"loss": 1.1205,
"step": 14650
},
{
"epoch": 0.6712328767123288,
"grad_norm": 21.96654510498047,
"learning_rate": 1.9931506849315068e-05,
"loss": 0.8928,
"step": 14700
},
{
"epoch": 0.6735159817351598,
"grad_norm": 78.21800994873047,
"learning_rate": 1.98972602739726e-05,
"loss": 0.9646,
"step": 14750
},
{
"epoch": 0.6757990867579908,
"grad_norm": 49.2692756652832,
"learning_rate": 1.9863013698630137e-05,
"loss": 1.0115,
"step": 14800
},
{
"epoch": 0.678082191780822,
"grad_norm": 35.43125915527344,
"learning_rate": 1.982876712328767e-05,
"loss": 1.0473,
"step": 14850
},
{
"epoch": 0.680365296803653,
"grad_norm": 18.292205810546875,
"learning_rate": 1.9794520547945207e-05,
"loss": 0.7609,
"step": 14900
},
{
"epoch": 0.682648401826484,
"grad_norm": 18.42896842956543,
"learning_rate": 1.976027397260274e-05,
"loss": 0.989,
"step": 14950
},
{
"epoch": 0.684931506849315,
"grad_norm": 25.584640502929688,
"learning_rate": 1.9726027397260273e-05,
"loss": 1.1391,
"step": 15000
},
{
"epoch": 0.6872146118721462,
"grad_norm": 12.116101264953613,
"learning_rate": 1.969178082191781e-05,
"loss": 1.09,
"step": 15050
},
{
"epoch": 0.6894977168949772,
"grad_norm": 4.520615100860596,
"learning_rate": 1.9657534246575342e-05,
"loss": 0.9834,
"step": 15100
},
{
"epoch": 0.6917808219178082,
"grad_norm": 16.461069107055664,
"learning_rate": 1.9623287671232875e-05,
"loss": 0.8777,
"step": 15150
},
{
"epoch": 0.6940639269406392,
"grad_norm": 41.71174240112305,
"learning_rate": 1.9589041095890412e-05,
"loss": 1.2619,
"step": 15200
},
{
"epoch": 0.6963470319634704,
"grad_norm": 10.733747482299805,
"learning_rate": 1.9554794520547945e-05,
"loss": 1.0843,
"step": 15250
},
{
"epoch": 0.6986301369863014,
"grad_norm": 34.77432632446289,
"learning_rate": 1.952054794520548e-05,
"loss": 0.9292,
"step": 15300
},
{
"epoch": 0.7009132420091324,
"grad_norm": 46.95319747924805,
"learning_rate": 1.9486301369863014e-05,
"loss": 1.0058,
"step": 15350
},
{
"epoch": 0.7031963470319634,
"grad_norm": 1.2955800294876099,
"learning_rate": 1.9452054794520547e-05,
"loss": 0.9672,
"step": 15400
},
{
"epoch": 0.7054794520547946,
"grad_norm": 16.37827491760254,
"learning_rate": 1.9417808219178084e-05,
"loss": 0.9386,
"step": 15450
},
{
"epoch": 0.7077625570776256,
"grad_norm": 41.87287139892578,
"learning_rate": 1.9383561643835617e-05,
"loss": 1.0772,
"step": 15500
},
{
"epoch": 0.7100456621004566,
"grad_norm": 8.469085693359375,
"learning_rate": 1.934931506849315e-05,
"loss": 0.9201,
"step": 15550
},
{
"epoch": 0.7123287671232876,
"grad_norm": 5.279160976409912,
"learning_rate": 1.9315068493150686e-05,
"loss": 1.2076,
"step": 15600
},
{
"epoch": 0.7146118721461188,
"grad_norm": 24.052440643310547,
"learning_rate": 1.928082191780822e-05,
"loss": 1.079,
"step": 15650
},
{
"epoch": 0.7168949771689498,
"grad_norm": 43.76991271972656,
"learning_rate": 1.9246575342465756e-05,
"loss": 0.9909,
"step": 15700
},
{
"epoch": 0.7191780821917808,
"grad_norm": 0.4643230438232422,
"learning_rate": 1.921232876712329e-05,
"loss": 1.0314,
"step": 15750
},
{
"epoch": 0.7214611872146118,
"grad_norm": 31.16683006286621,
"learning_rate": 1.9178082191780822e-05,
"loss": 1.0446,
"step": 15800
},
{
"epoch": 0.723744292237443,
"grad_norm": 2.468339443206787,
"learning_rate": 1.9143835616438358e-05,
"loss": 0.9801,
"step": 15850
},
{
"epoch": 0.726027397260274,
"grad_norm": 18.055511474609375,
"learning_rate": 1.910958904109589e-05,
"loss": 0.9679,
"step": 15900
},
{
"epoch": 0.728310502283105,
"grad_norm": 11.843624114990234,
"learning_rate": 1.9075342465753424e-05,
"loss": 1.049,
"step": 15950
},
{
"epoch": 0.730593607305936,
"grad_norm": 28.83637237548828,
"learning_rate": 1.904109589041096e-05,
"loss": 1.0862,
"step": 16000
},
{
"epoch": 0.7328767123287672,
"grad_norm": 41.827579498291016,
"learning_rate": 1.9006849315068494e-05,
"loss": 0.9812,
"step": 16050
},
{
"epoch": 0.7351598173515982,
"grad_norm": 19.110126495361328,
"learning_rate": 1.897260273972603e-05,
"loss": 1.2085,
"step": 16100
},
{
"epoch": 0.7374429223744292,
"grad_norm": 7.440555572509766,
"learning_rate": 1.8938356164383563e-05,
"loss": 1.0045,
"step": 16150
},
{
"epoch": 0.7397260273972602,
"grad_norm": 14.894591331481934,
"learning_rate": 1.8904109589041096e-05,
"loss": 1.0214,
"step": 16200
},
{
"epoch": 0.7420091324200914,
"grad_norm": 50.265140533447266,
"learning_rate": 1.8869863013698633e-05,
"loss": 1.0115,
"step": 16250
},
{
"epoch": 0.7442922374429224,
"grad_norm": 48.24840545654297,
"learning_rate": 1.8835616438356166e-05,
"loss": 1.022,
"step": 16300
},
{
"epoch": 0.7465753424657534,
"grad_norm": 23.087678909301758,
"learning_rate": 1.88013698630137e-05,
"loss": 0.9043,
"step": 16350
},
{
"epoch": 0.7488584474885844,
"grad_norm": 41.646461486816406,
"learning_rate": 1.8767123287671235e-05,
"loss": 1.0964,
"step": 16400
},
{
"epoch": 0.7511415525114156,
"grad_norm": 18.607566833496094,
"learning_rate": 1.8732876712328768e-05,
"loss": 1.1307,
"step": 16450
},
{
"epoch": 0.7534246575342466,
"grad_norm": 19.668004989624023,
"learning_rate": 1.8698630136986305e-05,
"loss": 0.9917,
"step": 16500
},
{
"epoch": 0.7557077625570776,
"grad_norm": 81.84849548339844,
"learning_rate": 1.8664383561643838e-05,
"loss": 1.1245,
"step": 16550
},
{
"epoch": 0.7579908675799086,
"grad_norm": 17.170801162719727,
"learning_rate": 1.863013698630137e-05,
"loss": 0.8943,
"step": 16600
},
{
"epoch": 0.7602739726027398,
"grad_norm": 20.587200164794922,
"learning_rate": 1.8595890410958907e-05,
"loss": 0.9715,
"step": 16650
},
{
"epoch": 0.7625570776255708,
"grad_norm": 65.86396026611328,
"learning_rate": 1.856164383561644e-05,
"loss": 1.0448,
"step": 16700
},
{
"epoch": 0.7648401826484018,
"grad_norm": 16.04901695251465,
"learning_rate": 1.8527397260273973e-05,
"loss": 1.0682,
"step": 16750
},
{
"epoch": 0.7671232876712328,
"grad_norm": 19.407167434692383,
"learning_rate": 1.8493150684931506e-05,
"loss": 0.9607,
"step": 16800
},
{
"epoch": 0.769406392694064,
"grad_norm": 13.71429443359375,
"learning_rate": 1.845890410958904e-05,
"loss": 1.0604,
"step": 16850
},
{
"epoch": 0.771689497716895,
"grad_norm": 51.34487533569336,
"learning_rate": 1.8424657534246576e-05,
"loss": 0.8779,
"step": 16900
},
{
"epoch": 0.773972602739726,
"grad_norm": 13.411955833435059,
"learning_rate": 1.839041095890411e-05,
"loss": 1.0547,
"step": 16950
},
{
"epoch": 0.776255707762557,
"grad_norm": 60.41900634765625,
"learning_rate": 1.8356164383561642e-05,
"loss": 1.0141,
"step": 17000
},
{
"epoch": 0.7785388127853882,
"grad_norm": 65.25275421142578,
"learning_rate": 1.832191780821918e-05,
"loss": 1.0541,
"step": 17050
},
{
"epoch": 0.7808219178082192,
"grad_norm": 64.50421905517578,
"learning_rate": 1.828767123287671e-05,
"loss": 1.0467,
"step": 17100
},
{
"epoch": 0.7831050228310502,
"grad_norm": 35.63545608520508,
"learning_rate": 1.8253424657534244e-05,
"loss": 1.0431,
"step": 17150
},
{
"epoch": 0.7853881278538812,
"grad_norm": 8.089126586914062,
"learning_rate": 1.821917808219178e-05,
"loss": 1.0258,
"step": 17200
},
{
"epoch": 0.7876712328767124,
"grad_norm": 33.44828414916992,
"learning_rate": 1.8184931506849314e-05,
"loss": 0.9516,
"step": 17250
},
{
"epoch": 0.7899543378995434,
"grad_norm": 52.08647537231445,
"learning_rate": 1.815068493150685e-05,
"loss": 1.0529,
"step": 17300
},
{
"epoch": 0.7922374429223744,
"grad_norm": 33.359886169433594,
"learning_rate": 1.8116438356164383e-05,
"loss": 1.0985,
"step": 17350
},
{
"epoch": 0.7945205479452054,
"grad_norm": 43.1573486328125,
"learning_rate": 1.8082191780821916e-05,
"loss": 0.987,
"step": 17400
},
{
"epoch": 0.7968036529680366,
"grad_norm": 41.887718200683594,
"learning_rate": 1.8047945205479453e-05,
"loss": 0.9562,
"step": 17450
},
{
"epoch": 0.7990867579908676,
"grad_norm": 4.935914993286133,
"learning_rate": 1.8013698630136986e-05,
"loss": 1.011,
"step": 17500
},
{
"epoch": 0.8013698630136986,
"grad_norm": 17.437244415283203,
"learning_rate": 1.797945205479452e-05,
"loss": 1.1939,
"step": 17550
},
{
"epoch": 0.8036529680365296,
"grad_norm": 38.83684158325195,
"learning_rate": 1.7945205479452055e-05,
"loss": 1.0831,
"step": 17600
},
{
"epoch": 0.8059360730593608,
"grad_norm": 389.5946960449219,
"learning_rate": 1.791095890410959e-05,
"loss": 0.9442,
"step": 17650
},
{
"epoch": 0.8082191780821918,
"grad_norm": 55.63192367553711,
"learning_rate": 1.7876712328767125e-05,
"loss": 0.8581,
"step": 17700
},
{
"epoch": 0.8105022831050228,
"grad_norm": 12.910749435424805,
"learning_rate": 1.7842465753424658e-05,
"loss": 0.9023,
"step": 17750
},
{
"epoch": 0.8127853881278538,
"grad_norm": 25.35328483581543,
"learning_rate": 1.780821917808219e-05,
"loss": 1.0642,
"step": 17800
},
{
"epoch": 0.815068493150685,
"grad_norm": 14.35108470916748,
"learning_rate": 1.7773972602739727e-05,
"loss": 0.9517,
"step": 17850
},
{
"epoch": 0.817351598173516,
"grad_norm": 16.878704071044922,
"learning_rate": 1.773972602739726e-05,
"loss": 1.0333,
"step": 17900
},
{
"epoch": 0.819634703196347,
"grad_norm": 4.217007160186768,
"learning_rate": 1.7705479452054793e-05,
"loss": 1.048,
"step": 17950
},
{
"epoch": 0.821917808219178,
"grad_norm": 7.6629767417907715,
"learning_rate": 1.767123287671233e-05,
"loss": 0.9847,
"step": 18000
},
{
"epoch": 0.8242009132420092,
"grad_norm": 8.383101463317871,
"learning_rate": 1.7636986301369863e-05,
"loss": 0.8269,
"step": 18050
},
{
"epoch": 0.8264840182648402,
"grad_norm": 21.882862091064453,
"learning_rate": 1.76027397260274e-05,
"loss": 1.0138,
"step": 18100
},
{
"epoch": 0.8287671232876712,
"grad_norm": 15.08014965057373,
"learning_rate": 1.7568493150684932e-05,
"loss": 1.0072,
"step": 18150
},
{
"epoch": 0.8310502283105022,
"grad_norm": 9.188797950744629,
"learning_rate": 1.7534246575342465e-05,
"loss": 0.998,
"step": 18200
},
{
"epoch": 0.8333333333333334,
"grad_norm": 8.108163833618164,
"learning_rate": 1.7500000000000002e-05,
"loss": 0.8591,
"step": 18250
},
{
"epoch": 0.8356164383561644,
"grad_norm": 8.082106590270996,
"learning_rate": 1.7465753424657535e-05,
"loss": 0.9486,
"step": 18300
},
{
"epoch": 0.8378995433789954,
"grad_norm": 9.28232479095459,
"learning_rate": 1.7431506849315068e-05,
"loss": 1.1733,
"step": 18350
},
{
"epoch": 0.8401826484018264,
"grad_norm": 42.009498596191406,
"learning_rate": 1.7397260273972604e-05,
"loss": 1.0863,
"step": 18400
},
{
"epoch": 0.8424657534246576,
"grad_norm": 16.338623046875,
"learning_rate": 1.7363013698630137e-05,
"loss": 0.8428,
"step": 18450
},
{
"epoch": 0.8447488584474886,
"grad_norm": 22.031343460083008,
"learning_rate": 1.7328767123287674e-05,
"loss": 0.9678,
"step": 18500
},
{
"epoch": 0.8470319634703196,
"grad_norm": 60.101600646972656,
"learning_rate": 1.7294520547945207e-05,
"loss": 1.0961,
"step": 18550
},
{
"epoch": 0.8493150684931506,
"grad_norm": 17.335065841674805,
"learning_rate": 1.726027397260274e-05,
"loss": 0.9917,
"step": 18600
},
{
"epoch": 0.8515981735159818,
"grad_norm": 1.8705586194992065,
"learning_rate": 1.7226027397260276e-05,
"loss": 0.9031,
"step": 18650
},
{
"epoch": 0.8538812785388128,
"grad_norm": 8.452926635742188,
"learning_rate": 1.719178082191781e-05,
"loss": 0.9161,
"step": 18700
},
{
"epoch": 0.8561643835616438,
"grad_norm": 22.091020584106445,
"learning_rate": 1.7157534246575342e-05,
"loss": 0.9697,
"step": 18750
},
{
"epoch": 0.8584474885844748,
"grad_norm": 14.892813682556152,
"learning_rate": 1.712328767123288e-05,
"loss": 1.1043,
"step": 18800
},
{
"epoch": 0.860730593607306,
"grad_norm": 5.6726274490356445,
"learning_rate": 1.7089041095890412e-05,
"loss": 0.9272,
"step": 18850
},
{
"epoch": 0.863013698630137,
"grad_norm": 4.598392963409424,
"learning_rate": 1.705479452054795e-05,
"loss": 1.0438,
"step": 18900
},
{
"epoch": 0.865296803652968,
"grad_norm": 10.54848575592041,
"learning_rate": 1.702054794520548e-05,
"loss": 1.068,
"step": 18950
},
{
"epoch": 0.867579908675799,
"grad_norm": 43.7364616394043,
"learning_rate": 1.6986301369863014e-05,
"loss": 0.9575,
"step": 19000
},
{
"epoch": 0.8698630136986302,
"grad_norm": 17.5739803314209,
"learning_rate": 1.695205479452055e-05,
"loss": 0.933,
"step": 19050
},
{
"epoch": 0.8721461187214612,
"grad_norm": 2.4345619678497314,
"learning_rate": 1.6917808219178084e-05,
"loss": 0.8702,
"step": 19100
},
{
"epoch": 0.8744292237442922,
"grad_norm": 27.58513069152832,
"learning_rate": 1.6883561643835617e-05,
"loss": 0.9842,
"step": 19150
},
{
"epoch": 0.8767123287671232,
"grad_norm": 44.58627700805664,
"learning_rate": 1.6849315068493153e-05,
"loss": 0.9117,
"step": 19200
},
{
"epoch": 0.8789954337899544,
"grad_norm": 23.16618537902832,
"learning_rate": 1.6815068493150686e-05,
"loss": 0.9699,
"step": 19250
},
{
"epoch": 0.8812785388127854,
"grad_norm": 10.979578971862793,
"learning_rate": 1.6780821917808223e-05,
"loss": 0.9397,
"step": 19300
},
{
"epoch": 0.8835616438356164,
"grad_norm": 12.799473762512207,
"learning_rate": 1.6746575342465753e-05,
"loss": 1.0797,
"step": 19350
},
{
"epoch": 0.8858447488584474,
"grad_norm": 6.065539836883545,
"learning_rate": 1.6712328767123286e-05,
"loss": 1.1177,
"step": 19400
},
{
"epoch": 0.8881278538812786,
"grad_norm": 14.423439979553223,
"learning_rate": 1.6678082191780822e-05,
"loss": 0.8818,
"step": 19450
},
{
"epoch": 0.8904109589041096,
"grad_norm": 1.1675946712493896,
"learning_rate": 1.6643835616438355e-05,
"loss": 0.8819,
"step": 19500
},
{
"epoch": 0.8926940639269406,
"grad_norm": 18.09140396118164,
"learning_rate": 1.6609589041095888e-05,
"loss": 0.9183,
"step": 19550
},
{
"epoch": 0.8949771689497716,
"grad_norm": 11.98416805267334,
"learning_rate": 1.6575342465753425e-05,
"loss": 1.0715,
"step": 19600
},
{
"epoch": 0.8972602739726028,
"grad_norm": 16.66376495361328,
"learning_rate": 1.6541095890410958e-05,
"loss": 0.7896,
"step": 19650
},
{
"epoch": 0.8995433789954338,
"grad_norm": 17.945493698120117,
"learning_rate": 1.6506849315068494e-05,
"loss": 0.8961,
"step": 19700
},
{
"epoch": 0.9018264840182648,
"grad_norm": 34.556827545166016,
"learning_rate": 1.6472602739726027e-05,
"loss": 0.9543,
"step": 19750
},
{
"epoch": 0.9041095890410958,
"grad_norm": 16.06093406677246,
"learning_rate": 1.643835616438356e-05,
"loss": 0.913,
"step": 19800
},
{
"epoch": 0.906392694063927,
"grad_norm": 28.835208892822266,
"learning_rate": 1.6404109589041096e-05,
"loss": 1.068,
"step": 19850
},
{
"epoch": 0.908675799086758,
"grad_norm": 3.6414895057678223,
"learning_rate": 1.636986301369863e-05,
"loss": 0.9695,
"step": 19900
},
{
"epoch": 0.910958904109589,
"grad_norm": 16.142841339111328,
"learning_rate": 1.6335616438356163e-05,
"loss": 1.0057,
"step": 19950
},
{
"epoch": 0.91324200913242,
"grad_norm": 15.585680961608887,
"learning_rate": 1.63013698630137e-05,
"loss": 0.964,
"step": 20000
},
{
"epoch": 0.9155251141552512,
"grad_norm": 23.79058265686035,
"learning_rate": 1.6267123287671232e-05,
"loss": 0.8243,
"step": 20050
},
{
"epoch": 0.9178082191780822,
"grad_norm": 14.258800506591797,
"learning_rate": 1.623287671232877e-05,
"loss": 0.9247,
"step": 20100
},
{
"epoch": 0.9200913242009132,
"grad_norm": 16.076623916625977,
"learning_rate": 1.61986301369863e-05,
"loss": 0.8446,
"step": 20150
},
{
"epoch": 0.9223744292237442,
"grad_norm": 19.330589294433594,
"learning_rate": 1.6164383561643835e-05,
"loss": 1.0295,
"step": 20200
},
{
"epoch": 0.9246575342465754,
"grad_norm": 29.074434280395508,
"learning_rate": 1.613013698630137e-05,
"loss": 1.0043,
"step": 20250
},
{
"epoch": 0.9269406392694064,
"grad_norm": 19.37755012512207,
"learning_rate": 1.6095890410958904e-05,
"loss": 0.8284,
"step": 20300
},
{
"epoch": 0.9292237442922374,
"grad_norm": 17.540924072265625,
"learning_rate": 1.6061643835616437e-05,
"loss": 0.9585,
"step": 20350
},
{
"epoch": 0.9315068493150684,
"grad_norm": 15.943336486816406,
"learning_rate": 1.6027397260273974e-05,
"loss": 0.9534,
"step": 20400
},
{
"epoch": 0.9337899543378996,
"grad_norm": 14.471035957336426,
"learning_rate": 1.5993150684931507e-05,
"loss": 0.9346,
"step": 20450
},
{
"epoch": 0.9360730593607306,
"grad_norm": 2.5866785049438477,
"learning_rate": 1.5958904109589043e-05,
"loss": 0.874,
"step": 20500
},
{
"epoch": 0.9383561643835616,
"grad_norm": 23.115089416503906,
"learning_rate": 1.5924657534246576e-05,
"loss": 1.0657,
"step": 20550
},
{
"epoch": 0.9406392694063926,
"grad_norm": 37.99090576171875,
"learning_rate": 1.589041095890411e-05,
"loss": 0.8484,
"step": 20600
},
{
"epoch": 0.9429223744292238,
"grad_norm": 8.818378448486328,
"learning_rate": 1.5856164383561646e-05,
"loss": 0.8682,
"step": 20650
},
{
"epoch": 0.9452054794520548,
"grad_norm": 64.82261657714844,
"learning_rate": 1.582191780821918e-05,
"loss": 0.9238,
"step": 20700
},
{
"epoch": 0.9474885844748858,
"grad_norm": 4.82705545425415,
"learning_rate": 1.578767123287671e-05,
"loss": 1.0183,
"step": 20750
},
{
"epoch": 0.9497716894977168,
"grad_norm": 35.37221908569336,
"learning_rate": 1.5753424657534248e-05,
"loss": 0.9661,
"step": 20800
},
{
"epoch": 0.952054794520548,
"grad_norm": 59.23238754272461,
"learning_rate": 1.571917808219178e-05,
"loss": 1.1756,
"step": 20850
},
{
"epoch": 0.954337899543379,
"grad_norm": 5.600244045257568,
"learning_rate": 1.5684931506849318e-05,
"loss": 0.9528,
"step": 20900
},
{
"epoch": 0.95662100456621,
"grad_norm": 36.54751968383789,
"learning_rate": 1.565068493150685e-05,
"loss": 0.8113,
"step": 20950
},
{
"epoch": 0.958904109589041,
"grad_norm": 21.206743240356445,
"learning_rate": 1.5616438356164384e-05,
"loss": 1.0184,
"step": 21000
},
{
"epoch": 0.9611872146118722,
"grad_norm": 5.483078479766846,
"learning_rate": 1.558219178082192e-05,
"loss": 1.0164,
"step": 21050
},
{
"epoch": 0.9634703196347032,
"grad_norm": 10.294635772705078,
"learning_rate": 1.5547945205479453e-05,
"loss": 0.8765,
"step": 21100
},
{
"epoch": 0.9657534246575342,
"grad_norm": 15.811431884765625,
"learning_rate": 1.5513698630136986e-05,
"loss": 1.0893,
"step": 21150
},
{
"epoch": 0.9680365296803652,
"grad_norm": 12.22139835357666,
"learning_rate": 1.5479452054794523e-05,
"loss": 0.8607,
"step": 21200
},
{
"epoch": 0.9703196347031964,
"grad_norm": 32.24748229980469,
"learning_rate": 1.5445205479452056e-05,
"loss": 0.9461,
"step": 21250
},
{
"epoch": 0.9726027397260274,
"grad_norm": 105.09614562988281,
"learning_rate": 1.5410958904109592e-05,
"loss": 1.0007,
"step": 21300
},
{
"epoch": 0.9748858447488584,
"grad_norm": 39.1617546081543,
"learning_rate": 1.5376712328767125e-05,
"loss": 0.8074,
"step": 21350
},
{
"epoch": 0.9771689497716894,
"grad_norm": 54.6515007019043,
"learning_rate": 1.5342465753424658e-05,
"loss": 1.0728,
"step": 21400
},
{
"epoch": 0.9794520547945206,
"grad_norm": 11.429344177246094,
"learning_rate": 1.5308219178082195e-05,
"loss": 1.0846,
"step": 21450
},
{
"epoch": 0.9817351598173516,
"grad_norm": 35.74282455444336,
"learning_rate": 1.5273972602739728e-05,
"loss": 0.9805,
"step": 21500
},
{
"epoch": 0.9840182648401826,
"grad_norm": 28.15215301513672,
"learning_rate": 1.5239726027397259e-05,
"loss": 1.057,
"step": 21550
},
{
"epoch": 0.9863013698630136,
"grad_norm": 24.915042877197266,
"learning_rate": 1.5205479452054795e-05,
"loss": 0.9953,
"step": 21600
},
{
"epoch": 0.9885844748858448,
"grad_norm": 54.70051574707031,
"learning_rate": 1.5171232876712328e-05,
"loss": 0.8781,
"step": 21650
},
{
"epoch": 0.9908675799086758,
"grad_norm": 57.645572662353516,
"learning_rate": 1.5136986301369865e-05,
"loss": 0.9031,
"step": 21700
},
{
"epoch": 0.9931506849315068,
"grad_norm": 49.95241165161133,
"learning_rate": 1.5102739726027398e-05,
"loss": 1.1313,
"step": 21750
},
{
"epoch": 0.9954337899543378,
"grad_norm": 11.169427871704102,
"learning_rate": 1.5068493150684931e-05,
"loss": 0.7831,
"step": 21800
},
{
"epoch": 0.997716894977169,
"grad_norm": 1.9000864028930664,
"learning_rate": 1.5034246575342467e-05,
"loss": 0.9135,
"step": 21850
},
{
"epoch": 1.0,
"grad_norm": 8.12338638305664,
"learning_rate": 1.5e-05,
"loss": 0.8517,
"step": 21900
},
{
"epoch": 1.0022831050228311,
"grad_norm": 9.123151779174805,
"learning_rate": 1.4965753424657535e-05,
"loss": 0.8109,
"step": 21950
},
{
"epoch": 1.004566210045662,
"grad_norm": 1.5693093538284302,
"learning_rate": 1.4931506849315068e-05,
"loss": 0.637,
"step": 22000
},
{
"epoch": 1.0068493150684932,
"grad_norm": 57.09550094604492,
"learning_rate": 1.4897260273972603e-05,
"loss": 0.7138,
"step": 22050
},
{
"epoch": 1.009132420091324,
"grad_norm": 22.930618286132812,
"learning_rate": 1.4863013698630138e-05,
"loss": 0.5267,
"step": 22100
},
{
"epoch": 1.0114155251141552,
"grad_norm": 6.480524063110352,
"learning_rate": 1.4828767123287672e-05,
"loss": 0.583,
"step": 22150
},
{
"epoch": 1.0136986301369864,
"grad_norm": 8.956890106201172,
"learning_rate": 1.4794520547945205e-05,
"loss": 0.838,
"step": 22200
},
{
"epoch": 1.0159817351598173,
"grad_norm": 28.80997657775879,
"learning_rate": 1.476027397260274e-05,
"loss": 0.7791,
"step": 22250
},
{
"epoch": 1.0182648401826484,
"grad_norm": 51.440643310546875,
"learning_rate": 1.4726027397260275e-05,
"loss": 0.6699,
"step": 22300
},
{
"epoch": 1.0205479452054795,
"grad_norm": 16.62401580810547,
"learning_rate": 1.469178082191781e-05,
"loss": 0.6056,
"step": 22350
},
{
"epoch": 1.0228310502283104,
"grad_norm": 6.103922367095947,
"learning_rate": 1.4657534246575343e-05,
"loss": 0.7163,
"step": 22400
},
{
"epoch": 1.0251141552511416,
"grad_norm": 13.347216606140137,
"learning_rate": 1.4623287671232877e-05,
"loss": 0.8158,
"step": 22450
},
{
"epoch": 1.0273972602739727,
"grad_norm": 10.218559265136719,
"learning_rate": 1.4589041095890412e-05,
"loss": 0.7067,
"step": 22500
},
{
"epoch": 1.0296803652968036,
"grad_norm": 20.360658645629883,
"learning_rate": 1.4554794520547945e-05,
"loss": 0.5413,
"step": 22550
},
{
"epoch": 1.0319634703196348,
"grad_norm": 14.813282012939453,
"learning_rate": 1.4520547945205478e-05,
"loss": 0.7121,
"step": 22600
},
{
"epoch": 1.0342465753424657,
"grad_norm": 4.041282653808594,
"learning_rate": 1.4486301369863013e-05,
"loss": 0.6259,
"step": 22650
},
{
"epoch": 1.0365296803652968,
"grad_norm": 8.718100547790527,
"learning_rate": 1.4452054794520548e-05,
"loss": 0.679,
"step": 22700
},
{
"epoch": 1.038812785388128,
"grad_norm": 24.674726486206055,
"learning_rate": 1.4417808219178082e-05,
"loss": 0.5784,
"step": 22750
},
{
"epoch": 1.0410958904109588,
"grad_norm": 18.543848037719727,
"learning_rate": 1.4383561643835615e-05,
"loss": 0.6259,
"step": 22800
},
{
"epoch": 1.04337899543379,
"grad_norm": 11.60615348815918,
"learning_rate": 1.434931506849315e-05,
"loss": 0.6859,
"step": 22850
},
{
"epoch": 1.045662100456621,
"grad_norm": 19.409120559692383,
"learning_rate": 1.4315068493150685e-05,
"loss": 0.724,
"step": 22900
},
{
"epoch": 1.047945205479452,
"grad_norm": 18.087451934814453,
"learning_rate": 1.428082191780822e-05,
"loss": 0.6719,
"step": 22950
},
{
"epoch": 1.0502283105022832,
"grad_norm": 37.11878967285156,
"learning_rate": 1.4246575342465753e-05,
"loss": 0.5994,
"step": 23000
},
{
"epoch": 1.052511415525114,
"grad_norm": 35.999149322509766,
"learning_rate": 1.4212328767123287e-05,
"loss": 0.8059,
"step": 23050
},
{
"epoch": 1.0547945205479452,
"grad_norm": 23.739049911499023,
"learning_rate": 1.4178082191780822e-05,
"loss": 0.4798,
"step": 23100
},
{
"epoch": 1.0570776255707763,
"grad_norm": 4.680600166320801,
"learning_rate": 1.4143835616438357e-05,
"loss": 0.6557,
"step": 23150
},
{
"epoch": 1.0593607305936072,
"grad_norm": 31.171585083007812,
"learning_rate": 1.410958904109589e-05,
"loss": 0.6122,
"step": 23200
},
{
"epoch": 1.0616438356164384,
"grad_norm": 10.616646766662598,
"learning_rate": 1.4075342465753425e-05,
"loss": 0.7256,
"step": 23250
},
{
"epoch": 1.0639269406392695,
"grad_norm": 5.910991191864014,
"learning_rate": 1.404109589041096e-05,
"loss": 0.5245,
"step": 23300
},
{
"epoch": 1.0662100456621004,
"grad_norm": 6.659880638122559,
"learning_rate": 1.4006849315068494e-05,
"loss": 0.6245,
"step": 23350
},
{
"epoch": 1.0684931506849316,
"grad_norm": 28.779626846313477,
"learning_rate": 1.3972602739726027e-05,
"loss": 0.7664,
"step": 23400
},
{
"epoch": 1.0707762557077625,
"grad_norm": 42.86344909667969,
"learning_rate": 1.3938356164383562e-05,
"loss": 0.7317,
"step": 23450
},
{
"epoch": 1.0730593607305936,
"grad_norm": 17.784263610839844,
"learning_rate": 1.3904109589041097e-05,
"loss": 0.7032,
"step": 23500
},
{
"epoch": 1.0753424657534247,
"grad_norm": 33.24246597290039,
"learning_rate": 1.3869863013698631e-05,
"loss": 0.6477,
"step": 23550
},
{
"epoch": 1.0776255707762556,
"grad_norm": 23.703388214111328,
"learning_rate": 1.3835616438356164e-05,
"loss": 0.5103,
"step": 23600
},
{
"epoch": 1.0799086757990868,
"grad_norm": 27.350101470947266,
"learning_rate": 1.38013698630137e-05,
"loss": 0.7417,
"step": 23650
},
{
"epoch": 1.0821917808219177,
"grad_norm": 32.2100715637207,
"learning_rate": 1.3767123287671234e-05,
"loss": 0.9106,
"step": 23700
},
{
"epoch": 1.0844748858447488,
"grad_norm": 8.949542999267578,
"learning_rate": 1.3732876712328769e-05,
"loss": 0.6949,
"step": 23750
},
{
"epoch": 1.08675799086758,
"grad_norm": 31.792240142822266,
"learning_rate": 1.3698630136986302e-05,
"loss": 0.725,
"step": 23800
},
{
"epoch": 1.0890410958904109,
"grad_norm": 51.95265579223633,
"learning_rate": 1.3664383561643835e-05,
"loss": 0.73,
"step": 23850
},
{
"epoch": 1.091324200913242,
"grad_norm": 19.113056182861328,
"learning_rate": 1.363013698630137e-05,
"loss": 0.6004,
"step": 23900
},
{
"epoch": 1.0936073059360731,
"grad_norm": 16.85398292541504,
"learning_rate": 1.3595890410958904e-05,
"loss": 0.5843,
"step": 23950
},
{
"epoch": 1.095890410958904,
"grad_norm": 20.591157913208008,
"learning_rate": 1.3561643835616437e-05,
"loss": 0.6938,
"step": 24000
},
{
"epoch": 1.0981735159817352,
"grad_norm": 22.592805862426758,
"learning_rate": 1.3527397260273972e-05,
"loss": 0.6366,
"step": 24050
},
{
"epoch": 1.1004566210045663,
"grad_norm": 77.25360870361328,
"learning_rate": 1.3493150684931507e-05,
"loss": 0.7245,
"step": 24100
},
{
"epoch": 1.1027397260273972,
"grad_norm": 35.447452545166016,
"learning_rate": 1.3458904109589042e-05,
"loss": 0.7001,
"step": 24150
},
{
"epoch": 1.1050228310502284,
"grad_norm": 50.135650634765625,
"learning_rate": 1.3424657534246575e-05,
"loss": 0.6473,
"step": 24200
},
{
"epoch": 1.1073059360730593,
"grad_norm": 9.847333908081055,
"learning_rate": 1.339041095890411e-05,
"loss": 0.6206,
"step": 24250
},
{
"epoch": 1.1095890410958904,
"grad_norm": 16.841964721679688,
"learning_rate": 1.3356164383561644e-05,
"loss": 0.754,
"step": 24300
},
{
"epoch": 1.1118721461187215,
"grad_norm": 25.09712028503418,
"learning_rate": 1.3321917808219179e-05,
"loss": 0.7678,
"step": 24350
},
{
"epoch": 1.1141552511415524,
"grad_norm": 20.83018684387207,
"learning_rate": 1.3287671232876712e-05,
"loss": 0.7375,
"step": 24400
},
{
"epoch": 1.1164383561643836,
"grad_norm": 1.3051903247833252,
"learning_rate": 1.3253424657534247e-05,
"loss": 0.73,
"step": 24450
},
{
"epoch": 1.1187214611872145,
"grad_norm": 21.558069229125977,
"learning_rate": 1.3219178082191781e-05,
"loss": 0.6535,
"step": 24500
},
{
"epoch": 1.1210045662100456,
"grad_norm": 53.48995590209961,
"learning_rate": 1.3184931506849316e-05,
"loss": 0.7592,
"step": 24550
},
{
"epoch": 1.1232876712328768,
"grad_norm": 6.8114166259765625,
"learning_rate": 1.3150684931506849e-05,
"loss": 0.6735,
"step": 24600
},
{
"epoch": 1.1255707762557077,
"grad_norm": 16.559179306030273,
"learning_rate": 1.3116438356164384e-05,
"loss": 0.4556,
"step": 24650
},
{
"epoch": 1.1278538812785388,
"grad_norm": 2.124957323074341,
"learning_rate": 1.3082191780821919e-05,
"loss": 0.92,
"step": 24700
},
{
"epoch": 1.13013698630137,
"grad_norm": 34.67999267578125,
"learning_rate": 1.3047945205479453e-05,
"loss": 0.6309,
"step": 24750
},
{
"epoch": 1.1324200913242009,
"grad_norm": 9.184309005737305,
"learning_rate": 1.3013698630136986e-05,
"loss": 0.7584,
"step": 24800
},
{
"epoch": 1.134703196347032,
"grad_norm": 17.552547454833984,
"learning_rate": 1.2979452054794521e-05,
"loss": 0.6098,
"step": 24850
},
{
"epoch": 1.1369863013698631,
"grad_norm": 37.24542999267578,
"learning_rate": 1.2945205479452056e-05,
"loss": 0.8336,
"step": 24900
},
{
"epoch": 1.139269406392694,
"grad_norm": 52.120880126953125,
"learning_rate": 1.291095890410959e-05,
"loss": 0.6372,
"step": 24950
},
{
"epoch": 1.1415525114155252,
"grad_norm": 13.773005485534668,
"learning_rate": 1.2876712328767124e-05,
"loss": 0.6734,
"step": 25000
},
{
"epoch": 1.143835616438356,
"grad_norm": 12.726426124572754,
"learning_rate": 1.2842465753424658e-05,
"loss": 0.5249,
"step": 25050
},
{
"epoch": 1.1461187214611872,
"grad_norm": 8.800257682800293,
"learning_rate": 1.2808219178082193e-05,
"loss": 0.5094,
"step": 25100
},
{
"epoch": 1.1484018264840183,
"grad_norm": 77.74162292480469,
"learning_rate": 1.2773972602739726e-05,
"loss": 0.7281,
"step": 25150
},
{
"epoch": 1.1506849315068493,
"grad_norm": 8.394207954406738,
"learning_rate": 1.2739726027397259e-05,
"loss": 0.615,
"step": 25200
},
{
"epoch": 1.1529680365296804,
"grad_norm": 62.93704605102539,
"learning_rate": 1.2705479452054794e-05,
"loss": 0.5909,
"step": 25250
},
{
"epoch": 1.1552511415525113,
"grad_norm": 18.461288452148438,
"learning_rate": 1.2671232876712329e-05,
"loss": 0.6578,
"step": 25300
},
{
"epoch": 1.1575342465753424,
"grad_norm": 17.798723220825195,
"learning_rate": 1.2636986301369863e-05,
"loss": 0.7566,
"step": 25350
},
{
"epoch": 1.1598173515981736,
"grad_norm": 0.32706037163734436,
"learning_rate": 1.2602739726027396e-05,
"loss": 0.6193,
"step": 25400
},
{
"epoch": 1.1621004566210045,
"grad_norm": 19.030363082885742,
"learning_rate": 1.2568493150684931e-05,
"loss": 0.6249,
"step": 25450
},
{
"epoch": 1.1643835616438356,
"grad_norm": 0.1700681447982788,
"learning_rate": 1.2534246575342466e-05,
"loss": 0.7782,
"step": 25500
},
{
"epoch": 1.1666666666666667,
"grad_norm": 147.6623077392578,
"learning_rate": 1.25e-05,
"loss": 0.6307,
"step": 25550
},
{
"epoch": 1.1689497716894977,
"grad_norm": 13.00705623626709,
"learning_rate": 1.2465753424657534e-05,
"loss": 0.6638,
"step": 25600
},
{
"epoch": 1.1712328767123288,
"grad_norm": 10.626334190368652,
"learning_rate": 1.2431506849315068e-05,
"loss": 0.7118,
"step": 25650
},
{
"epoch": 1.17351598173516,
"grad_norm": 5.709634304046631,
"learning_rate": 1.2397260273972603e-05,
"loss": 0.6299,
"step": 25700
},
{
"epoch": 1.1757990867579908,
"grad_norm": 30.29009246826172,
"learning_rate": 1.2363013698630138e-05,
"loss": 0.7486,
"step": 25750
},
{
"epoch": 1.178082191780822,
"grad_norm": 18.62900733947754,
"learning_rate": 1.2328767123287671e-05,
"loss": 0.5936,
"step": 25800
},
{
"epoch": 1.1803652968036529,
"grad_norm": 14.033234596252441,
"learning_rate": 1.2294520547945206e-05,
"loss": 0.7004,
"step": 25850
},
{
"epoch": 1.182648401826484,
"grad_norm": 38.063873291015625,
"learning_rate": 1.226027397260274e-05,
"loss": 0.7239,
"step": 25900
},
{
"epoch": 1.1849315068493151,
"grad_norm": 40.94581985473633,
"learning_rate": 1.2226027397260275e-05,
"loss": 0.6232,
"step": 25950
},
{
"epoch": 1.187214611872146,
"grad_norm": 10.695091247558594,
"learning_rate": 1.2191780821917808e-05,
"loss": 0.8526,
"step": 26000
},
{
"epoch": 1.1894977168949772,
"grad_norm": 19.597412109375,
"learning_rate": 1.2157534246575343e-05,
"loss": 0.6878,
"step": 26050
},
{
"epoch": 1.191780821917808,
"grad_norm": 57.820003509521484,
"learning_rate": 1.2123287671232878e-05,
"loss": 0.7689,
"step": 26100
},
{
"epoch": 1.1940639269406392,
"grad_norm": 19.449691772460938,
"learning_rate": 1.2089041095890412e-05,
"loss": 0.6267,
"step": 26150
},
{
"epoch": 1.1963470319634704,
"grad_norm": 30.782194137573242,
"learning_rate": 1.2054794520547945e-05,
"loss": 0.6165,
"step": 26200
},
{
"epoch": 1.1986301369863013,
"grad_norm": 27.298837661743164,
"learning_rate": 1.202054794520548e-05,
"loss": 0.8057,
"step": 26250
},
{
"epoch": 1.2009132420091324,
"grad_norm": 185.9980010986328,
"learning_rate": 1.1986301369863015e-05,
"loss": 0.7694,
"step": 26300
},
{
"epoch": 1.2031963470319635,
"grad_norm": 3.1694886684417725,
"learning_rate": 1.195205479452055e-05,
"loss": 0.6598,
"step": 26350
},
{
"epoch": 1.2054794520547945,
"grad_norm": 6.1167097091674805,
"learning_rate": 1.1917808219178083e-05,
"loss": 0.6464,
"step": 26400
},
{
"epoch": 1.2077625570776256,
"grad_norm": 15.646038055419922,
"learning_rate": 1.1883561643835616e-05,
"loss": 0.9328,
"step": 26450
},
{
"epoch": 1.2100456621004567,
"grad_norm": 7.978808403015137,
"learning_rate": 1.184931506849315e-05,
"loss": 0.6384,
"step": 26500
},
{
"epoch": 1.2123287671232876,
"grad_norm": 21.067642211914062,
"learning_rate": 1.1815068493150685e-05,
"loss": 0.8629,
"step": 26550
},
{
"epoch": 1.2146118721461188,
"grad_norm": 27.418428421020508,
"learning_rate": 1.1780821917808218e-05,
"loss": 0.6208,
"step": 26600
},
{
"epoch": 1.2168949771689497,
"grad_norm": 10.672882080078125,
"learning_rate": 1.1746575342465753e-05,
"loss": 0.5777,
"step": 26650
},
{
"epoch": 1.2191780821917808,
"grad_norm": 82.25353240966797,
"learning_rate": 1.1712328767123288e-05,
"loss": 0.9331,
"step": 26700
},
{
"epoch": 1.221461187214612,
"grad_norm": 12.071101188659668,
"learning_rate": 1.1678082191780822e-05,
"loss": 0.7766,
"step": 26750
},
{
"epoch": 1.2237442922374429,
"grad_norm": 23.268993377685547,
"learning_rate": 1.1643835616438355e-05,
"loss": 0.6875,
"step": 26800
},
{
"epoch": 1.226027397260274,
"grad_norm": 39.18050003051758,
"learning_rate": 1.160958904109589e-05,
"loss": 0.7285,
"step": 26850
},
{
"epoch": 1.228310502283105,
"grad_norm": 3.805318832397461,
"learning_rate": 1.1575342465753425e-05,
"loss": 0.6275,
"step": 26900
},
{
"epoch": 1.230593607305936,
"grad_norm": 39.39872741699219,
"learning_rate": 1.154109589041096e-05,
"loss": 0.6889,
"step": 26950
},
{
"epoch": 1.2328767123287672,
"grad_norm": 16.79233169555664,
"learning_rate": 1.1506849315068493e-05,
"loss": 0.5247,
"step": 27000
},
{
"epoch": 1.235159817351598,
"grad_norm": 30.6214656829834,
"learning_rate": 1.1472602739726027e-05,
"loss": 0.5567,
"step": 27050
},
{
"epoch": 1.2374429223744292,
"grad_norm": 5.248475551605225,
"learning_rate": 1.1438356164383562e-05,
"loss": 0.7133,
"step": 27100
},
{
"epoch": 1.2397260273972603,
"grad_norm": 16.61432456970215,
"learning_rate": 1.1404109589041097e-05,
"loss": 0.6672,
"step": 27150
},
{
"epoch": 1.2420091324200913,
"grad_norm": 16.011980056762695,
"learning_rate": 1.136986301369863e-05,
"loss": 0.7677,
"step": 27200
},
{
"epoch": 1.2442922374429224,
"grad_norm": 34.770938873291016,
"learning_rate": 1.1335616438356165e-05,
"loss": 0.6866,
"step": 27250
},
{
"epoch": 1.2465753424657535,
"grad_norm": 39.278892517089844,
"learning_rate": 1.13013698630137e-05,
"loss": 0.7042,
"step": 27300
},
{
"epoch": 1.2488584474885844,
"grad_norm": 21.77368927001953,
"learning_rate": 1.1267123287671234e-05,
"loss": 0.6923,
"step": 27350
},
{
"epoch": 1.2511415525114156,
"grad_norm": 8.305987358093262,
"learning_rate": 1.1232876712328767e-05,
"loss": 0.6222,
"step": 27400
},
{
"epoch": 1.2534246575342465,
"grad_norm": 27.238414764404297,
"learning_rate": 1.1198630136986302e-05,
"loss": 0.6661,
"step": 27450
},
{
"epoch": 1.2557077625570776,
"grad_norm": 25.92155647277832,
"learning_rate": 1.1164383561643837e-05,
"loss": 0.7408,
"step": 27500
},
{
"epoch": 1.2579908675799087,
"grad_norm": 16.504167556762695,
"learning_rate": 1.1130136986301371e-05,
"loss": 0.5134,
"step": 27550
},
{
"epoch": 1.2602739726027397,
"grad_norm": 33.571617126464844,
"learning_rate": 1.1095890410958904e-05,
"loss": 0.7309,
"step": 27600
},
{
"epoch": 1.2625570776255708,
"grad_norm": 10.731171607971191,
"learning_rate": 1.106164383561644e-05,
"loss": 0.7008,
"step": 27650
},
{
"epoch": 1.2648401826484017,
"grad_norm": 9.138223648071289,
"learning_rate": 1.1027397260273972e-05,
"loss": 0.7206,
"step": 27700
},
{
"epoch": 1.2671232876712328,
"grad_norm": 19.61602210998535,
"learning_rate": 1.0993150684931507e-05,
"loss": 0.7169,
"step": 27750
},
{
"epoch": 1.269406392694064,
"grad_norm": 25.721527099609375,
"learning_rate": 1.095890410958904e-05,
"loss": 0.6631,
"step": 27800
},
{
"epoch": 1.271689497716895,
"grad_norm": 15.044477462768555,
"learning_rate": 1.0924657534246575e-05,
"loss": 0.6167,
"step": 27850
},
{
"epoch": 1.273972602739726,
"grad_norm": 13.365096092224121,
"learning_rate": 1.089041095890411e-05,
"loss": 0.5629,
"step": 27900
},
{
"epoch": 1.2762557077625571,
"grad_norm": 26.571229934692383,
"learning_rate": 1.0856164383561644e-05,
"loss": 0.6986,
"step": 27950
},
{
"epoch": 1.278538812785388,
"grad_norm": 23.07392692565918,
"learning_rate": 1.0821917808219177e-05,
"loss": 0.6209,
"step": 28000
},
{
"epoch": 1.2808219178082192,
"grad_norm": 1.8312214612960815,
"learning_rate": 1.0787671232876712e-05,
"loss": 0.7247,
"step": 28050
},
{
"epoch": 1.2831050228310503,
"grad_norm": 3.6338565349578857,
"learning_rate": 1.0753424657534247e-05,
"loss": 0.896,
"step": 28100
},
{
"epoch": 1.2853881278538812,
"grad_norm": 22.483678817749023,
"learning_rate": 1.0719178082191782e-05,
"loss": 0.7087,
"step": 28150
},
{
"epoch": 1.2876712328767124,
"grad_norm": 29.26844024658203,
"learning_rate": 1.0684931506849315e-05,
"loss": 0.7095,
"step": 28200
},
{
"epoch": 1.2899543378995433,
"grad_norm": 0.29415565729141235,
"learning_rate": 1.065068493150685e-05,
"loss": 0.5849,
"step": 28250
},
{
"epoch": 1.2922374429223744,
"grad_norm": 0.7176687121391296,
"learning_rate": 1.0616438356164384e-05,
"loss": 0.657,
"step": 28300
},
{
"epoch": 1.2945205479452055,
"grad_norm": 11.345915794372559,
"learning_rate": 1.0582191780821919e-05,
"loss": 0.5598,
"step": 28350
},
{
"epoch": 1.2968036529680365,
"grad_norm": 35.81932067871094,
"learning_rate": 1.0547945205479452e-05,
"loss": 0.8267,
"step": 28400
},
{
"epoch": 1.2990867579908676,
"grad_norm": 30.678194046020508,
"learning_rate": 1.0513698630136987e-05,
"loss": 0.6508,
"step": 28450
},
{
"epoch": 1.3013698630136985,
"grad_norm": 3.55430006980896,
"learning_rate": 1.0479452054794521e-05,
"loss": 0.6641,
"step": 28500
},
{
"epoch": 1.3036529680365296,
"grad_norm": 18.566883087158203,
"learning_rate": 1.0445205479452056e-05,
"loss": 0.7889,
"step": 28550
},
{
"epoch": 1.3059360730593608,
"grad_norm": 19.019390106201172,
"learning_rate": 1.0410958904109589e-05,
"loss": 0.6177,
"step": 28600
},
{
"epoch": 1.308219178082192,
"grad_norm": 17.26028823852539,
"learning_rate": 1.0376712328767124e-05,
"loss": 0.7045,
"step": 28650
},
{
"epoch": 1.3105022831050228,
"grad_norm": 33.43575668334961,
"learning_rate": 1.0342465753424659e-05,
"loss": 0.6046,
"step": 28700
},
{
"epoch": 1.312785388127854,
"grad_norm": 51.09929275512695,
"learning_rate": 1.0308219178082193e-05,
"loss": 0.5056,
"step": 28750
},
{
"epoch": 1.3150684931506849,
"grad_norm": 17.530790328979492,
"learning_rate": 1.0273972602739726e-05,
"loss": 0.6507,
"step": 28800
},
{
"epoch": 1.317351598173516,
"grad_norm": 26.142900466918945,
"learning_rate": 1.0239726027397261e-05,
"loss": 0.5859,
"step": 28850
},
{
"epoch": 1.3196347031963471,
"grad_norm": 16.933874130249023,
"learning_rate": 1.0205479452054796e-05,
"loss": 0.7274,
"step": 28900
},
{
"epoch": 1.321917808219178,
"grad_norm": 5.472013473510742,
"learning_rate": 1.017123287671233e-05,
"loss": 0.6445,
"step": 28950
},
{
"epoch": 1.3242009132420092,
"grad_norm": 20.826509475708008,
"learning_rate": 1.0136986301369862e-05,
"loss": 0.7194,
"step": 29000
},
{
"epoch": 1.32648401826484,
"grad_norm": 36.42399597167969,
"learning_rate": 1.0102739726027397e-05,
"loss": 0.6405,
"step": 29050
},
{
"epoch": 1.3287671232876712,
"grad_norm": 17.444469451904297,
"learning_rate": 1.0068493150684931e-05,
"loss": 0.7348,
"step": 29100
},
{
"epoch": 1.3310502283105023,
"grad_norm": 12.610512733459473,
"learning_rate": 1.0034246575342466e-05,
"loss": 0.6521,
"step": 29150
},
{
"epoch": 1.3333333333333333,
"grad_norm": 14.737506866455078,
"learning_rate": 9.999999999999999e-06,
"loss": 0.6979,
"step": 29200
},
{
"epoch": 1.3356164383561644,
"grad_norm": 27.808490753173828,
"learning_rate": 9.965753424657534e-06,
"loss": 0.7839,
"step": 29250
},
{
"epoch": 1.3378995433789953,
"grad_norm": 7.368427753448486,
"learning_rate": 9.931506849315069e-06,
"loss": 0.6117,
"step": 29300
},
{
"epoch": 1.3401826484018264,
"grad_norm": 27.834564208984375,
"learning_rate": 9.897260273972603e-06,
"loss": 0.6565,
"step": 29350
},
{
"epoch": 1.3424657534246576,
"grad_norm": 33.823081970214844,
"learning_rate": 9.863013698630136e-06,
"loss": 0.6681,
"step": 29400
},
{
"epoch": 1.3447488584474887,
"grad_norm": 26.18691635131836,
"learning_rate": 9.828767123287671e-06,
"loss": 0.9009,
"step": 29450
},
{
"epoch": 1.3470319634703196,
"grad_norm": 20.92985725402832,
"learning_rate": 9.794520547945206e-06,
"loss": 0.5297,
"step": 29500
},
{
"epoch": 1.3493150684931507,
"grad_norm": 4.244528770446777,
"learning_rate": 9.76027397260274e-06,
"loss": 0.6733,
"step": 29550
},
{
"epoch": 1.3515981735159817,
"grad_norm": 67.45909118652344,
"learning_rate": 9.726027397260274e-06,
"loss": 0.575,
"step": 29600
},
{
"epoch": 1.3538812785388128,
"grad_norm": 26.713943481445312,
"learning_rate": 9.691780821917808e-06,
"loss": 0.7428,
"step": 29650
},
{
"epoch": 1.356164383561644,
"grad_norm": 8.588297843933105,
"learning_rate": 9.657534246575343e-06,
"loss": 0.5567,
"step": 29700
},
{
"epoch": 1.3584474885844748,
"grad_norm": 22.027597427368164,
"learning_rate": 9.623287671232878e-06,
"loss": 0.7798,
"step": 29750
},
{
"epoch": 1.360730593607306,
"grad_norm": 10.512967109680176,
"learning_rate": 9.589041095890411e-06,
"loss": 0.5646,
"step": 29800
},
{
"epoch": 1.3630136986301369,
"grad_norm": 9.836832046508789,
"learning_rate": 9.554794520547946e-06,
"loss": 0.6904,
"step": 29850
},
{
"epoch": 1.365296803652968,
"grad_norm": 74.73491668701172,
"learning_rate": 9.52054794520548e-06,
"loss": 0.7029,
"step": 29900
},
{
"epoch": 1.3675799086757991,
"grad_norm": 36.467491149902344,
"learning_rate": 9.486301369863015e-06,
"loss": 0.7266,
"step": 29950
},
{
"epoch": 1.36986301369863,
"grad_norm": 21.993133544921875,
"learning_rate": 9.452054794520548e-06,
"loss": 0.6316,
"step": 30000
},
{
"epoch": 1.3721461187214612,
"grad_norm": 1.2455904483795166,
"learning_rate": 9.417808219178083e-06,
"loss": 0.5555,
"step": 30050
},
{
"epoch": 1.374429223744292,
"grad_norm": 13.85091495513916,
"learning_rate": 9.383561643835618e-06,
"loss": 0.7729,
"step": 30100
},
{
"epoch": 1.3767123287671232,
"grad_norm": 23.61090850830078,
"learning_rate": 9.349315068493152e-06,
"loss": 0.6255,
"step": 30150
},
{
"epoch": 1.3789954337899544,
"grad_norm": 40.24787139892578,
"learning_rate": 9.315068493150685e-06,
"loss": 0.7646,
"step": 30200
},
{
"epoch": 1.3812785388127855,
"grad_norm": 23.249425888061523,
"learning_rate": 9.28082191780822e-06,
"loss": 0.5965,
"step": 30250
},
{
"epoch": 1.3835616438356164,
"grad_norm": 8.090579986572266,
"learning_rate": 9.246575342465753e-06,
"loss": 0.4478,
"step": 30300
},
{
"epoch": 1.3858447488584476,
"grad_norm": 20.011905670166016,
"learning_rate": 9.212328767123288e-06,
"loss": 0.7777,
"step": 30350
},
{
"epoch": 1.3881278538812785,
"grad_norm": 8.697684288024902,
"learning_rate": 9.178082191780821e-06,
"loss": 0.6304,
"step": 30400
},
{
"epoch": 1.3904109589041096,
"grad_norm": 1.545689582824707,
"learning_rate": 9.143835616438356e-06,
"loss": 0.7816,
"step": 30450
},
{
"epoch": 1.3926940639269407,
"grad_norm": 0.6150842308998108,
"learning_rate": 9.10958904109589e-06,
"loss": 0.5802,
"step": 30500
},
{
"epoch": 1.3949771689497716,
"grad_norm": 21.5743350982666,
"learning_rate": 9.075342465753425e-06,
"loss": 0.6369,
"step": 30550
},
{
"epoch": 1.3972602739726028,
"grad_norm": 10.7164306640625,
"learning_rate": 9.041095890410958e-06,
"loss": 0.4902,
"step": 30600
},
{
"epoch": 1.3995433789954337,
"grad_norm": 28.598312377929688,
"learning_rate": 9.006849315068493e-06,
"loss": 0.613,
"step": 30650
},
{
"epoch": 1.4018264840182648,
"grad_norm": 24.138431549072266,
"learning_rate": 8.972602739726028e-06,
"loss": 0.6153,
"step": 30700
},
{
"epoch": 1.404109589041096,
"grad_norm": 22.55198860168457,
"learning_rate": 8.938356164383562e-06,
"loss": 0.5419,
"step": 30750
},
{
"epoch": 1.4063926940639269,
"grad_norm": 26.992374420166016,
"learning_rate": 8.904109589041095e-06,
"loss": 0.5726,
"step": 30800
},
{
"epoch": 1.408675799086758,
"grad_norm": 1.8640153408050537,
"learning_rate": 8.86986301369863e-06,
"loss": 0.6794,
"step": 30850
},
{
"epoch": 1.410958904109589,
"grad_norm": 14.333259582519531,
"learning_rate": 8.835616438356165e-06,
"loss": 0.4972,
"step": 30900
},
{
"epoch": 1.41324200913242,
"grad_norm": 64.34575653076172,
"learning_rate": 8.8013698630137e-06,
"loss": 0.9213,
"step": 30950
},
{
"epoch": 1.4155251141552512,
"grad_norm": 9.60162353515625,
"learning_rate": 8.767123287671233e-06,
"loss": 0.6868,
"step": 31000
},
{
"epoch": 1.4178082191780823,
"grad_norm": 8.155502319335938,
"learning_rate": 8.732876712328767e-06,
"loss": 0.6873,
"step": 31050
},
{
"epoch": 1.4200913242009132,
"grad_norm": 5.085892677307129,
"learning_rate": 8.698630136986302e-06,
"loss": 0.7768,
"step": 31100
},
{
"epoch": 1.4223744292237444,
"grad_norm": 52.48747634887695,
"learning_rate": 8.664383561643837e-06,
"loss": 0.687,
"step": 31150
},
{
"epoch": 1.4246575342465753,
"grad_norm": 1.8491209745407104,
"learning_rate": 8.63013698630137e-06,
"loss": 0.5669,
"step": 31200
},
{
"epoch": 1.4269406392694064,
"grad_norm": 12.143204689025879,
"learning_rate": 8.595890410958905e-06,
"loss": 0.5568,
"step": 31250
},
{
"epoch": 1.4292237442922375,
"grad_norm": 2.939903974533081,
"learning_rate": 8.56164383561644e-06,
"loss": 0.7732,
"step": 31300
},
{
"epoch": 1.4315068493150684,
"grad_norm": 17.773130416870117,
"learning_rate": 8.527397260273974e-06,
"loss": 0.7444,
"step": 31350
},
{
"epoch": 1.4337899543378996,
"grad_norm": 47.27958679199219,
"learning_rate": 8.493150684931507e-06,
"loss": 0.6621,
"step": 31400
},
{
"epoch": 1.4360730593607305,
"grad_norm": 50.40327453613281,
"learning_rate": 8.458904109589042e-06,
"loss": 0.84,
"step": 31450
},
{
"epoch": 1.4383561643835616,
"grad_norm": 8.335402488708496,
"learning_rate": 8.424657534246577e-06,
"loss": 0.6762,
"step": 31500
},
{
"epoch": 1.4406392694063928,
"grad_norm": 12.027316093444824,
"learning_rate": 8.390410958904111e-06,
"loss": 0.6736,
"step": 31550
},
{
"epoch": 1.4429223744292237,
"grad_norm": 17.410192489624023,
"learning_rate": 8.356164383561643e-06,
"loss": 0.5072,
"step": 31600
},
{
"epoch": 1.4452054794520548,
"grad_norm": 48.263450622558594,
"learning_rate": 8.321917808219178e-06,
"loss": 0.6268,
"step": 31650
},
{
"epoch": 1.4474885844748857,
"grad_norm": 3.8568694591522217,
"learning_rate": 8.287671232876712e-06,
"loss": 0.5454,
"step": 31700
},
{
"epoch": 1.4497716894977168,
"grad_norm": 13.764704704284668,
"learning_rate": 8.253424657534247e-06,
"loss": 0.6823,
"step": 31750
},
{
"epoch": 1.452054794520548,
"grad_norm": 13.48620319366455,
"learning_rate": 8.21917808219178e-06,
"loss": 0.7103,
"step": 31800
},
{
"epoch": 1.454337899543379,
"grad_norm": 17.291501998901367,
"learning_rate": 8.184931506849315e-06,
"loss": 0.7011,
"step": 31850
},
{
"epoch": 1.45662100456621,
"grad_norm": 1.461418867111206,
"learning_rate": 8.15068493150685e-06,
"loss": 0.6667,
"step": 31900
},
{
"epoch": 1.4589041095890412,
"grad_norm": 16.34942626953125,
"learning_rate": 8.116438356164384e-06,
"loss": 0.7885,
"step": 31950
},
{
"epoch": 1.461187214611872,
"grad_norm": 14.74634075164795,
"learning_rate": 8.082191780821917e-06,
"loss": 0.644,
"step": 32000
},
{
"epoch": 1.4634703196347032,
"grad_norm": 6.794888973236084,
"learning_rate": 8.047945205479452e-06,
"loss": 0.6738,
"step": 32050
},
{
"epoch": 1.4657534246575343,
"grad_norm": 31.303226470947266,
"learning_rate": 8.013698630136987e-06,
"loss": 0.8235,
"step": 32100
},
{
"epoch": 1.4680365296803652,
"grad_norm": 42.993648529052734,
"learning_rate": 7.979452054794521e-06,
"loss": 0.4712,
"step": 32150
},
{
"epoch": 1.4703196347031964,
"grad_norm": 7.875132083892822,
"learning_rate": 7.945205479452055e-06,
"loss": 0.5951,
"step": 32200
},
{
"epoch": 1.4726027397260273,
"grad_norm": 9.124963760375977,
"learning_rate": 7.91095890410959e-06,
"loss": 0.5968,
"step": 32250
},
{
"epoch": 1.4748858447488584,
"grad_norm": 13.793811798095703,
"learning_rate": 7.876712328767124e-06,
"loss": 0.6484,
"step": 32300
},
{
"epoch": 1.4771689497716896,
"grad_norm": 2.1718921661376953,
"learning_rate": 7.842465753424659e-06,
"loss": 0.6279,
"step": 32350
},
{
"epoch": 1.4794520547945205,
"grad_norm": 60.621543884277344,
"learning_rate": 7.808219178082192e-06,
"loss": 0.6718,
"step": 32400
},
{
"epoch": 1.4817351598173516,
"grad_norm": 6.748918533325195,
"learning_rate": 7.773972602739727e-06,
"loss": 0.6333,
"step": 32450
},
{
"epoch": 1.4840182648401825,
"grad_norm": 10.061300277709961,
"learning_rate": 7.739726027397261e-06,
"loss": 0.5952,
"step": 32500
},
{
"epoch": 1.4863013698630136,
"grad_norm": 55.56220245361328,
"learning_rate": 7.705479452054796e-06,
"loss": 0.6297,
"step": 32550
},
{
"epoch": 1.4885844748858448,
"grad_norm": 31.07186508178711,
"learning_rate": 7.671232876712329e-06,
"loss": 0.5302,
"step": 32600
},
{
"epoch": 1.490867579908676,
"grad_norm": 30.925626754760742,
"learning_rate": 7.636986301369864e-06,
"loss": 0.6608,
"step": 32650
},
{
"epoch": 1.4931506849315068,
"grad_norm": 21.15188217163086,
"learning_rate": 7.602739726027398e-06,
"loss": 0.6838,
"step": 32700
},
{
"epoch": 1.495433789954338,
"grad_norm": 15.808161735534668,
"learning_rate": 7.568493150684932e-06,
"loss": 0.5405,
"step": 32750
},
{
"epoch": 1.4977168949771689,
"grad_norm": 11.866249084472656,
"learning_rate": 7.5342465753424655e-06,
"loss": 0.6924,
"step": 32800
},
{
"epoch": 1.5,
"grad_norm": 29.02684783935547,
"learning_rate": 7.5e-06,
"loss": 0.6354,
"step": 32850
},
{
"epoch": 1.5022831050228311,
"grad_norm": 20.26506996154785,
"learning_rate": 7.465753424657534e-06,
"loss": 0.6467,
"step": 32900
},
{
"epoch": 1.5045662100456623,
"grad_norm": 23.63490867614746,
"learning_rate": 7.431506849315069e-06,
"loss": 0.6682,
"step": 32950
},
{
"epoch": 1.5068493150684932,
"grad_norm": 16.075380325317383,
"learning_rate": 7.397260273972603e-06,
"loss": 0.6922,
"step": 33000
},
{
"epoch": 1.509132420091324,
"grad_norm": 14.159255027770996,
"learning_rate": 7.3630136986301374e-06,
"loss": 0.6063,
"step": 33050
},
{
"epoch": 1.5114155251141552,
"grad_norm": 22.143796920776367,
"learning_rate": 7.328767123287671e-06,
"loss": 0.7155,
"step": 33100
},
{
"epoch": 1.5136986301369864,
"grad_norm": 98.22097778320312,
"learning_rate": 7.294520547945206e-06,
"loss": 0.7567,
"step": 33150
},
{
"epoch": 1.5159817351598175,
"grad_norm": 0.7336256504058838,
"learning_rate": 7.260273972602739e-06,
"loss": 0.5414,
"step": 33200
},
{
"epoch": 1.5182648401826484,
"grad_norm": 0.3773713707923889,
"learning_rate": 7.226027397260274e-06,
"loss": 0.5702,
"step": 33250
},
{
"epoch": 1.5205479452054793,
"grad_norm": 8.909625053405762,
"learning_rate": 7.191780821917808e-06,
"loss": 0.7228,
"step": 33300
},
{
"epoch": 1.5228310502283104,
"grad_norm": 21.098960876464844,
"learning_rate": 7.1575342465753425e-06,
"loss": 0.6001,
"step": 33350
},
{
"epoch": 1.5251141552511416,
"grad_norm": 15.906450271606445,
"learning_rate": 7.123287671232876e-06,
"loss": 0.6303,
"step": 33400
},
{
"epoch": 1.5273972602739727,
"grad_norm": 24.9348201751709,
"learning_rate": 7.089041095890411e-06,
"loss": 0.673,
"step": 33450
},
{
"epoch": 1.5296803652968036,
"grad_norm": 8.255683898925781,
"learning_rate": 7.054794520547945e-06,
"loss": 0.5831,
"step": 33500
},
{
"epoch": 1.5319634703196348,
"grad_norm": 73.46847534179688,
"learning_rate": 7.02054794520548e-06,
"loss": 0.7362,
"step": 33550
},
{
"epoch": 1.5342465753424657,
"grad_norm": 88.85016632080078,
"learning_rate": 6.986301369863014e-06,
"loss": 0.6682,
"step": 33600
},
{
"epoch": 1.5365296803652968,
"grad_norm": 52.53008270263672,
"learning_rate": 6.952054794520548e-06,
"loss": 0.5667,
"step": 33650
},
{
"epoch": 1.538812785388128,
"grad_norm": 18.00398826599121,
"learning_rate": 6.917808219178082e-06,
"loss": 0.6214,
"step": 33700
},
{
"epoch": 1.541095890410959,
"grad_norm": 27.124656677246094,
"learning_rate": 6.883561643835617e-06,
"loss": 0.6737,
"step": 33750
},
{
"epoch": 1.54337899543379,
"grad_norm": 39.45083999633789,
"learning_rate": 6.849315068493151e-06,
"loss": 0.703,
"step": 33800
},
{
"epoch": 1.545662100456621,
"grad_norm": 0.20495979487895966,
"learning_rate": 6.815068493150685e-06,
"loss": 0.6271,
"step": 33850
},
{
"epoch": 1.547945205479452,
"grad_norm": 0.8208453059196472,
"learning_rate": 6.780821917808219e-06,
"loss": 0.6804,
"step": 33900
},
{
"epoch": 1.5502283105022832,
"grad_norm": 12.416110038757324,
"learning_rate": 6.746575342465753e-06,
"loss": 0.7786,
"step": 33950
},
{
"epoch": 1.5525114155251143,
"grad_norm": 21.69839096069336,
"learning_rate": 6.712328767123287e-06,
"loss": 0.7048,
"step": 34000
},
{
"epoch": 1.5547945205479452,
"grad_norm": 57.30881881713867,
"learning_rate": 6.678082191780822e-06,
"loss": 0.7398,
"step": 34050
},
{
"epoch": 1.5570776255707761,
"grad_norm": 22.58492088317871,
"learning_rate": 6.643835616438356e-06,
"loss": 0.7027,
"step": 34100
},
{
"epoch": 1.5593607305936072,
"grad_norm": 8.803092002868652,
"learning_rate": 6.609589041095891e-06,
"loss": 0.7296,
"step": 34150
},
{
"epoch": 1.5616438356164384,
"grad_norm": 18.931156158447266,
"learning_rate": 6.5753424657534245e-06,
"loss": 0.7622,
"step": 34200
},
{
"epoch": 1.5639269406392695,
"grad_norm": 8.858073234558105,
"learning_rate": 6.541095890410959e-06,
"loss": 0.615,
"step": 34250
},
{
"epoch": 1.5662100456621004,
"grad_norm": 6.284381866455078,
"learning_rate": 6.506849315068493e-06,
"loss": 0.7147,
"step": 34300
},
{
"epoch": 1.5684931506849316,
"grad_norm": 21.08570098876953,
"learning_rate": 6.472602739726028e-06,
"loss": 0.6153,
"step": 34350
},
{
"epoch": 1.5707762557077625,
"grad_norm": 2.0850419998168945,
"learning_rate": 6.438356164383562e-06,
"loss": 0.588,
"step": 34400
},
{
"epoch": 1.5730593607305936,
"grad_norm": 0.25530076026916504,
"learning_rate": 6.4041095890410965e-06,
"loss": 0.7249,
"step": 34450
},
{
"epoch": 1.5753424657534247,
"grad_norm": 31.284807205200195,
"learning_rate": 6.3698630136986296e-06,
"loss": 0.6106,
"step": 34500
},
{
"epoch": 1.5776255707762559,
"grad_norm": 19.524412155151367,
"learning_rate": 6.335616438356164e-06,
"loss": 0.7259,
"step": 34550
},
{
"epoch": 1.5799086757990868,
"grad_norm": 6.005446910858154,
"learning_rate": 6.301369863013698e-06,
"loss": 0.5231,
"step": 34600
},
{
"epoch": 1.5821917808219177,
"grad_norm": 17.577402114868164,
"learning_rate": 6.267123287671233e-06,
"loss": 0.847,
"step": 34650
},
{
"epoch": 1.5844748858447488,
"grad_norm": 31.817855834960938,
"learning_rate": 6.232876712328767e-06,
"loss": 0.5509,
"step": 34700
},
{
"epoch": 1.58675799086758,
"grad_norm": 4.6908063888549805,
"learning_rate": 6.1986301369863016e-06,
"loss": 0.752,
"step": 34750
},
{
"epoch": 1.589041095890411,
"grad_norm": 2.8228561878204346,
"learning_rate": 6.1643835616438354e-06,
"loss": 0.7156,
"step": 34800
},
{
"epoch": 1.591324200913242,
"grad_norm": 7.878891468048096,
"learning_rate": 6.13013698630137e-06,
"loss": 0.6926,
"step": 34850
},
{
"epoch": 1.593607305936073,
"grad_norm": 30.530006408691406,
"learning_rate": 6.095890410958904e-06,
"loss": 0.7552,
"step": 34900
},
{
"epoch": 1.595890410958904,
"grad_norm": 29.396806716918945,
"learning_rate": 6.061643835616439e-06,
"loss": 0.7008,
"step": 34950
},
{
"epoch": 1.5981735159817352,
"grad_norm": 10.500929832458496,
"learning_rate": 6.027397260273973e-06,
"loss": 0.4952,
"step": 35000
},
{
"epoch": 1.6004566210045663,
"grad_norm": 2.337519407272339,
"learning_rate": 5.9931506849315074e-06,
"loss": 0.5864,
"step": 35050
},
{
"epoch": 1.6027397260273972,
"grad_norm": 8.646376609802246,
"learning_rate": 5.958904109589041e-06,
"loss": 0.7454,
"step": 35100
},
{
"epoch": 1.6050228310502284,
"grad_norm": 17.153099060058594,
"learning_rate": 5.924657534246575e-06,
"loss": 0.6943,
"step": 35150
},
{
"epoch": 1.6073059360730593,
"grad_norm": 25.350088119506836,
"learning_rate": 5.890410958904109e-06,
"loss": 0.6803,
"step": 35200
},
{
"epoch": 1.6095890410958904,
"grad_norm": 4.12929105758667,
"learning_rate": 5.856164383561644e-06,
"loss": 0.6976,
"step": 35250
},
{
"epoch": 1.6118721461187215,
"grad_norm": 14.61955451965332,
"learning_rate": 5.821917808219178e-06,
"loss": 0.7487,
"step": 35300
},
{
"epoch": 1.6141552511415527,
"grad_norm": 6.208589553833008,
"learning_rate": 5.7876712328767125e-06,
"loss": 0.6569,
"step": 35350
},
{
"epoch": 1.6164383561643836,
"grad_norm": 9.5521240234375,
"learning_rate": 5.753424657534246e-06,
"loss": 0.6505,
"step": 35400
},
{
"epoch": 1.6187214611872145,
"grad_norm": 14.391396522521973,
"learning_rate": 5.719178082191781e-06,
"loss": 0.5558,
"step": 35450
},
{
"epoch": 1.6210045662100456,
"grad_norm": 1.2627131938934326,
"learning_rate": 5.684931506849315e-06,
"loss": 0.6405,
"step": 35500
},
{
"epoch": 1.6232876712328768,
"grad_norm": 83.7956314086914,
"learning_rate": 5.65068493150685e-06,
"loss": 0.5483,
"step": 35550
},
{
"epoch": 1.625570776255708,
"grad_norm": 15.18497085571289,
"learning_rate": 5.616438356164384e-06,
"loss": 0.6692,
"step": 35600
},
{
"epoch": 1.6278538812785388,
"grad_norm": 33.16044998168945,
"learning_rate": 5.582191780821918e-06,
"loss": 0.6939,
"step": 35650
},
{
"epoch": 1.6301369863013697,
"grad_norm": 12.063103675842285,
"learning_rate": 5.547945205479452e-06,
"loss": 0.5414,
"step": 35700
},
{
"epoch": 1.6324200913242009,
"grad_norm": 26.803749084472656,
"learning_rate": 5.513698630136986e-06,
"loss": 0.5353,
"step": 35750
},
{
"epoch": 1.634703196347032,
"grad_norm": 6.5856523513793945,
"learning_rate": 5.47945205479452e-06,
"loss": 0.7678,
"step": 35800
},
{
"epoch": 1.6369863013698631,
"grad_norm": 14.661989212036133,
"learning_rate": 5.445205479452055e-06,
"loss": 0.67,
"step": 35850
},
{
"epoch": 1.639269406392694,
"grad_norm": 2.4577255249023438,
"learning_rate": 5.410958904109589e-06,
"loss": 0.4579,
"step": 35900
},
{
"epoch": 1.6415525114155252,
"grad_norm": 17.849546432495117,
"learning_rate": 5.376712328767123e-06,
"loss": 0.7221,
"step": 35950
},
{
"epoch": 1.643835616438356,
"grad_norm": 6.453017234802246,
"learning_rate": 5.342465753424657e-06,
"loss": 0.6889,
"step": 36000
},
{
"epoch": 1.6461187214611872,
"grad_norm": 15.138044357299805,
"learning_rate": 5.308219178082192e-06,
"loss": 0.6234,
"step": 36050
},
{
"epoch": 1.6484018264840183,
"grad_norm": 11.393730163574219,
"learning_rate": 5.273972602739726e-06,
"loss": 0.6353,
"step": 36100
},
{
"epoch": 1.6506849315068495,
"grad_norm": 2.726991891860962,
"learning_rate": 5.239726027397261e-06,
"loss": 0.5305,
"step": 36150
},
{
"epoch": 1.6529680365296804,
"grad_norm": 14.54566478729248,
"learning_rate": 5.2054794520547945e-06,
"loss": 0.8252,
"step": 36200
},
{
"epoch": 1.6552511415525113,
"grad_norm": 0.952422022819519,
"learning_rate": 5.171232876712329e-06,
"loss": 0.6773,
"step": 36250
},
{
"epoch": 1.6575342465753424,
"grad_norm": 21.44168472290039,
"learning_rate": 5.136986301369863e-06,
"loss": 0.6106,
"step": 36300
},
{
"epoch": 1.6598173515981736,
"grad_norm": 11.663095474243164,
"learning_rate": 5.102739726027398e-06,
"loss": 0.6514,
"step": 36350
},
{
"epoch": 1.6621004566210047,
"grad_norm": 23.589557647705078,
"learning_rate": 5.068493150684931e-06,
"loss": 0.708,
"step": 36400
},
{
"epoch": 1.6643835616438356,
"grad_norm": 10.283199310302734,
"learning_rate": 5.034246575342466e-06,
"loss": 0.5563,
"step": 36450
},
{
"epoch": 1.6666666666666665,
"grad_norm": 44.115047454833984,
"learning_rate": 4.9999999999999996e-06,
"loss": 0.5912,
"step": 36500
},
{
"epoch": 1.6689497716894977,
"grad_norm": 71.8247299194336,
"learning_rate": 4.965753424657534e-06,
"loss": 0.7993,
"step": 36550
},
{
"epoch": 1.6712328767123288,
"grad_norm": 11.808229446411133,
"learning_rate": 4.931506849315068e-06,
"loss": 0.5851,
"step": 36600
},
{
"epoch": 1.67351598173516,
"grad_norm": 74.73955535888672,
"learning_rate": 4.897260273972603e-06,
"loss": 0.7004,
"step": 36650
},
{
"epoch": 1.6757990867579908,
"grad_norm": 36.06229019165039,
"learning_rate": 4.863013698630137e-06,
"loss": 0.7382,
"step": 36700
},
{
"epoch": 1.678082191780822,
"grad_norm": 53.29566955566406,
"learning_rate": 4.8287671232876716e-06,
"loss": 0.5035,
"step": 36750
},
{
"epoch": 1.6803652968036529,
"grad_norm": 21.9272403717041,
"learning_rate": 4.7945205479452054e-06,
"loss": 0.5309,
"step": 36800
},
{
"epoch": 1.682648401826484,
"grad_norm": 39.56712341308594,
"learning_rate": 4.76027397260274e-06,
"loss": 0.4754,
"step": 36850
},
{
"epoch": 1.6849315068493151,
"grad_norm": 2.814680576324463,
"learning_rate": 4.726027397260274e-06,
"loss": 0.7033,
"step": 36900
},
{
"epoch": 1.6872146118721463,
"grad_norm": 95.83110809326172,
"learning_rate": 4.691780821917809e-06,
"loss": 0.6999,
"step": 36950
},
{
"epoch": 1.6894977168949772,
"grad_norm": 27.638185501098633,
"learning_rate": 4.657534246575343e-06,
"loss": 0.6522,
"step": 37000
},
{
"epoch": 1.691780821917808,
"grad_norm": 10.899153709411621,
"learning_rate": 4.623287671232877e-06,
"loss": 0.6386,
"step": 37050
},
{
"epoch": 1.6940639269406392,
"grad_norm": 21.410276412963867,
"learning_rate": 4.5890410958904105e-06,
"loss": 0.695,
"step": 37100
},
{
"epoch": 1.6963470319634704,
"grad_norm": 15.208582878112793,
"learning_rate": 4.554794520547945e-06,
"loss": 0.7636,
"step": 37150
},
{
"epoch": 1.6986301369863015,
"grad_norm": 19.083850860595703,
"learning_rate": 4.520547945205479e-06,
"loss": 0.6331,
"step": 37200
},
{
"epoch": 1.7009132420091324,
"grad_norm": 4.408557415008545,
"learning_rate": 4.486301369863014e-06,
"loss": 0.6886,
"step": 37250
},
{
"epoch": 1.7031963470319633,
"grad_norm": 10.206310272216797,
"learning_rate": 4.452054794520548e-06,
"loss": 0.639,
"step": 37300
},
{
"epoch": 1.7054794520547945,
"grad_norm": 18.985891342163086,
"learning_rate": 4.4178082191780825e-06,
"loss": 0.6884,
"step": 37350
},
{
"epoch": 1.7077625570776256,
"grad_norm": 16.533288955688477,
"learning_rate": 4.383561643835616e-06,
"loss": 0.6446,
"step": 37400
},
{
"epoch": 1.7100456621004567,
"grad_norm": 25.728469848632812,
"learning_rate": 4.349315068493151e-06,
"loss": 0.4863,
"step": 37450
},
{
"epoch": 1.7123287671232876,
"grad_norm": 88.2020492553711,
"learning_rate": 4.315068493150685e-06,
"loss": 0.6831,
"step": 37500
},
{
"epoch": 1.7146118721461188,
"grad_norm": 44.737815856933594,
"learning_rate": 4.28082191780822e-06,
"loss": 0.4687,
"step": 37550
},
{
"epoch": 1.7168949771689497,
"grad_norm": 1.9043503999710083,
"learning_rate": 4.246575342465754e-06,
"loss": 0.6608,
"step": 37600
},
{
"epoch": 1.7191780821917808,
"grad_norm": 11.180625915527344,
"learning_rate": 4.212328767123288e-06,
"loss": 0.7672,
"step": 37650
},
{
"epoch": 1.721461187214612,
"grad_norm": 3.448392391204834,
"learning_rate": 4.178082191780821e-06,
"loss": 0.6176,
"step": 37700
},
{
"epoch": 1.723744292237443,
"grad_norm": 10.672887802124023,
"learning_rate": 4.143835616438356e-06,
"loss": 0.5263,
"step": 37750
},
{
"epoch": 1.726027397260274,
"grad_norm": 15.69261360168457,
"learning_rate": 4.10958904109589e-06,
"loss": 0.4637,
"step": 37800
},
{
"epoch": 1.728310502283105,
"grad_norm": 19.786346435546875,
"learning_rate": 4.075342465753425e-06,
"loss": 0.4554,
"step": 37850
},
{
"epoch": 1.730593607305936,
"grad_norm": 31.991483688354492,
"learning_rate": 4.041095890410959e-06,
"loss": 0.561,
"step": 37900
},
{
"epoch": 1.7328767123287672,
"grad_norm": 21.96062469482422,
"learning_rate": 4.006849315068493e-06,
"loss": 0.5969,
"step": 37950
},
{
"epoch": 1.7351598173515983,
"grad_norm": 10.800865173339844,
"learning_rate": 3.972602739726027e-06,
"loss": 0.6058,
"step": 38000
},
{
"epoch": 1.7374429223744292,
"grad_norm": 10.979826927185059,
"learning_rate": 3.938356164383562e-06,
"loss": 0.6034,
"step": 38050
},
{
"epoch": 1.7397260273972601,
"grad_norm": 41.3328742980957,
"learning_rate": 3.904109589041096e-06,
"loss": 0.5687,
"step": 38100
},
{
"epoch": 1.7420091324200913,
"grad_norm": 38.379608154296875,
"learning_rate": 3.869863013698631e-06,
"loss": 0.6931,
"step": 38150
},
{
"epoch": 1.7442922374429224,
"grad_norm": 3.292733907699585,
"learning_rate": 3.8356164383561645e-06,
"loss": 0.4162,
"step": 38200
},
{
"epoch": 1.7465753424657535,
"grad_norm": 18.47883415222168,
"learning_rate": 3.801369863013699e-06,
"loss": 0.6574,
"step": 38250
},
{
"epoch": 1.7488584474885844,
"grad_norm": 10.917158126831055,
"learning_rate": 3.7671232876712327e-06,
"loss": 0.6617,
"step": 38300
},
{
"epoch": 1.7511415525114156,
"grad_norm": 13.783547401428223,
"learning_rate": 3.732876712328767e-06,
"loss": 0.7701,
"step": 38350
},
{
"epoch": 1.7534246575342465,
"grad_norm": 21.937267303466797,
"learning_rate": 3.6986301369863014e-06,
"loss": 0.7627,
"step": 38400
},
{
"epoch": 1.7557077625570776,
"grad_norm": 15.421838760375977,
"learning_rate": 3.6643835616438357e-06,
"loss": 0.6636,
"step": 38450
},
{
"epoch": 1.7579908675799087,
"grad_norm": 14.788371086120605,
"learning_rate": 3.6301369863013696e-06,
"loss": 0.5593,
"step": 38500
},
{
"epoch": 1.7602739726027399,
"grad_norm": 5.76630163192749,
"learning_rate": 3.595890410958904e-06,
"loss": 0.5112,
"step": 38550
},
{
"epoch": 1.7625570776255708,
"grad_norm": 23.72429656982422,
"learning_rate": 3.561643835616438e-06,
"loss": 0.5729,
"step": 38600
},
{
"epoch": 1.7648401826484017,
"grad_norm": 18.512802124023438,
"learning_rate": 3.5273972602739725e-06,
"loss": 0.6827,
"step": 38650
},
{
"epoch": 1.7671232876712328,
"grad_norm": 4.7244720458984375,
"learning_rate": 3.493150684931507e-06,
"loss": 0.6428,
"step": 38700
},
{
"epoch": 1.769406392694064,
"grad_norm": 6.208735466003418,
"learning_rate": 3.458904109589041e-06,
"loss": 0.5986,
"step": 38750
},
{
"epoch": 1.771689497716895,
"grad_norm": 2.6915433406829834,
"learning_rate": 3.4246575342465754e-06,
"loss": 0.5905,
"step": 38800
},
{
"epoch": 1.773972602739726,
"grad_norm": 4.2726969718933105,
"learning_rate": 3.3904109589041093e-06,
"loss": 0.564,
"step": 38850
},
{
"epoch": 1.776255707762557,
"grad_norm": 26.4520263671875,
"learning_rate": 3.3561643835616436e-06,
"loss": 0.5738,
"step": 38900
},
{
"epoch": 1.778538812785388,
"grad_norm": 43.63593673706055,
"learning_rate": 3.321917808219178e-06,
"loss": 0.621,
"step": 38950
},
{
"epoch": 1.7808219178082192,
"grad_norm": 4.847127437591553,
"learning_rate": 3.2876712328767123e-06,
"loss": 0.8252,
"step": 39000
},
{
"epoch": 1.7831050228310503,
"grad_norm": 7.686138153076172,
"learning_rate": 3.2534246575342466e-06,
"loss": 0.7211,
"step": 39050
},
{
"epoch": 1.7853881278538812,
"grad_norm": 42.92139434814453,
"learning_rate": 3.219178082191781e-06,
"loss": 0.7652,
"step": 39100
},
{
"epoch": 1.7876712328767124,
"grad_norm": 57.05276107788086,
"learning_rate": 3.1849315068493148e-06,
"loss": 0.5234,
"step": 39150
},
{
"epoch": 1.7899543378995433,
"grad_norm": 18.587209701538086,
"learning_rate": 3.150684931506849e-06,
"loss": 0.457,
"step": 39200
},
{
"epoch": 1.7922374429223744,
"grad_norm": 7.19858455657959,
"learning_rate": 3.1164383561643834e-06,
"loss": 0.8795,
"step": 39250
},
{
"epoch": 1.7945205479452055,
"grad_norm": 10.892264366149902,
"learning_rate": 3.0821917808219177e-06,
"loss": 0.7042,
"step": 39300
},
{
"epoch": 1.7968036529680367,
"grad_norm": 28.82424545288086,
"learning_rate": 3.047945205479452e-06,
"loss": 0.6396,
"step": 39350
},
{
"epoch": 1.7990867579908676,
"grad_norm": 7.087406158447266,
"learning_rate": 3.0136986301369864e-06,
"loss": 0.6665,
"step": 39400
},
{
"epoch": 1.8013698630136985,
"grad_norm": 22.56847381591797,
"learning_rate": 2.9794520547945207e-06,
"loss": 0.7265,
"step": 39450
},
{
"epoch": 1.8036529680365296,
"grad_norm": 18.845949172973633,
"learning_rate": 2.9452054794520546e-06,
"loss": 0.6475,
"step": 39500
},
{
"epoch": 1.8059360730593608,
"grad_norm": 26.794076919555664,
"learning_rate": 2.910958904109589e-06,
"loss": 0.7632,
"step": 39550
},
{
"epoch": 1.808219178082192,
"grad_norm": 0.44524723291397095,
"learning_rate": 2.876712328767123e-06,
"loss": 0.4843,
"step": 39600
},
{
"epoch": 1.8105022831050228,
"grad_norm": 45.64598083496094,
"learning_rate": 2.8424657534246575e-06,
"loss": 0.5918,
"step": 39650
},
{
"epoch": 1.8127853881278537,
"grad_norm": 75.41986846923828,
"learning_rate": 2.808219178082192e-06,
"loss": 0.6352,
"step": 39700
},
{
"epoch": 1.8150684931506849,
"grad_norm": 10.345170974731445,
"learning_rate": 2.773972602739726e-06,
"loss": 0.6204,
"step": 39750
},
{
"epoch": 1.817351598173516,
"grad_norm": 11.58834171295166,
"learning_rate": 2.73972602739726e-06,
"loss": 0.5325,
"step": 39800
},
{
"epoch": 1.8196347031963471,
"grad_norm": 14.26885986328125,
"learning_rate": 2.7054794520547943e-06,
"loss": 0.5008,
"step": 39850
},
{
"epoch": 1.821917808219178,
"grad_norm": 4.937170505523682,
"learning_rate": 2.6712328767123286e-06,
"loss": 0.7317,
"step": 39900
},
{
"epoch": 1.8242009132420092,
"grad_norm": 27.849742889404297,
"learning_rate": 2.636986301369863e-06,
"loss": 0.6825,
"step": 39950
},
{
"epoch": 1.82648401826484,
"grad_norm": 38.649810791015625,
"learning_rate": 2.6027397260273973e-06,
"loss": 0.4422,
"step": 40000
},
{
"epoch": 1.8287671232876712,
"grad_norm": 52.95954895019531,
"learning_rate": 2.5684931506849316e-06,
"loss": 0.747,
"step": 40050
},
{
"epoch": 1.8310502283105023,
"grad_norm": 10.486088752746582,
"learning_rate": 2.5342465753424655e-06,
"loss": 0.6757,
"step": 40100
},
{
"epoch": 1.8333333333333335,
"grad_norm": 0.9142507314682007,
"learning_rate": 2.4999999999999998e-06,
"loss": 0.712,
"step": 40150
},
{
"epoch": 1.8356164383561644,
"grad_norm": 51.62909698486328,
"learning_rate": 2.465753424657534e-06,
"loss": 0.6107,
"step": 40200
},
{
"epoch": 1.8378995433789953,
"grad_norm": 31.60240936279297,
"learning_rate": 2.4315068493150684e-06,
"loss": 0.601,
"step": 40250
},
{
"epoch": 1.8401826484018264,
"grad_norm": 55.16636657714844,
"learning_rate": 2.3972602739726027e-06,
"loss": 0.6121,
"step": 40300
},
{
"epoch": 1.8424657534246576,
"grad_norm": 34.450416564941406,
"learning_rate": 2.363013698630137e-06,
"loss": 0.6331,
"step": 40350
},
{
"epoch": 1.8447488584474887,
"grad_norm": 27.42693328857422,
"learning_rate": 2.3287671232876713e-06,
"loss": 0.733,
"step": 40400
},
{
"epoch": 1.8470319634703196,
"grad_norm": 44.26624298095703,
"learning_rate": 2.2945205479452052e-06,
"loss": 0.7453,
"step": 40450
},
{
"epoch": 1.8493150684931505,
"grad_norm": 16.07997703552246,
"learning_rate": 2.2602739726027396e-06,
"loss": 0.6628,
"step": 40500
},
{
"epoch": 1.8515981735159817,
"grad_norm": 2.816776752471924,
"learning_rate": 2.226027397260274e-06,
"loss": 0.4991,
"step": 40550
},
{
"epoch": 1.8538812785388128,
"grad_norm": 4.0401434898376465,
"learning_rate": 2.191780821917808e-06,
"loss": 0.6943,
"step": 40600
},
{
"epoch": 1.856164383561644,
"grad_norm": 5.525669097900391,
"learning_rate": 2.1575342465753425e-06,
"loss": 0.6887,
"step": 40650
},
{
"epoch": 1.8584474885844748,
"grad_norm": 85.989990234375,
"learning_rate": 2.123287671232877e-06,
"loss": 0.7425,
"step": 40700
},
{
"epoch": 1.860730593607306,
"grad_norm": 4.5465779304504395,
"learning_rate": 2.0890410958904107e-06,
"loss": 0.7462,
"step": 40750
},
{
"epoch": 1.8630136986301369,
"grad_norm": 5.920977592468262,
"learning_rate": 2.054794520547945e-06,
"loss": 0.5546,
"step": 40800
},
{
"epoch": 1.865296803652968,
"grad_norm": 12.154388427734375,
"learning_rate": 2.0205479452054793e-06,
"loss": 0.6693,
"step": 40850
},
{
"epoch": 1.8675799086757991,
"grad_norm": 16.800073623657227,
"learning_rate": 1.9863013698630136e-06,
"loss": 0.7908,
"step": 40900
},
{
"epoch": 1.8698630136986303,
"grad_norm": 43.42325973510742,
"learning_rate": 1.952054794520548e-06,
"loss": 0.5155,
"step": 40950
},
{
"epoch": 1.8721461187214612,
"grad_norm": 29.5067138671875,
"learning_rate": 1.9178082191780823e-06,
"loss": 0.7588,
"step": 41000
},
{
"epoch": 1.874429223744292,
"grad_norm": 28.01750946044922,
"learning_rate": 1.8835616438356164e-06,
"loss": 0.6622,
"step": 41050
},
{
"epoch": 1.8767123287671232,
"grad_norm": 16.869781494140625,
"learning_rate": 1.8493150684931507e-06,
"loss": 0.6737,
"step": 41100
},
{
"epoch": 1.8789954337899544,
"grad_norm": 8.377634048461914,
"learning_rate": 1.8150684931506848e-06,
"loss": 0.6281,
"step": 41150
},
{
"epoch": 1.8812785388127855,
"grad_norm": 16.61414337158203,
"learning_rate": 1.780821917808219e-06,
"loss": 0.6251,
"step": 41200
},
{
"epoch": 1.8835616438356164,
"grad_norm": 16.144508361816406,
"learning_rate": 1.7465753424657534e-06,
"loss": 0.6607,
"step": 41250
},
{
"epoch": 1.8858447488584473,
"grad_norm": 20.15201759338379,
"learning_rate": 1.7123287671232877e-06,
"loss": 0.5207,
"step": 41300
},
{
"epoch": 1.8881278538812785,
"grad_norm": 7.15456485748291,
"learning_rate": 1.6780821917808218e-06,
"loss": 0.5882,
"step": 41350
},
{
"epoch": 1.8904109589041096,
"grad_norm": 17.336624145507812,
"learning_rate": 1.6438356164383561e-06,
"loss": 0.556,
"step": 41400
},
{
"epoch": 1.8926940639269407,
"grad_norm": 20.451026916503906,
"learning_rate": 1.6095890410958904e-06,
"loss": 0.6011,
"step": 41450
},
{
"epoch": 1.8949771689497716,
"grad_norm": 33.44941329956055,
"learning_rate": 1.5753424657534245e-06,
"loss": 0.5166,
"step": 41500
},
{
"epoch": 1.8972602739726028,
"grad_norm": 24.176786422729492,
"learning_rate": 1.5410958904109589e-06,
"loss": 0.6837,
"step": 41550
},
{
"epoch": 1.8995433789954337,
"grad_norm": 17.142606735229492,
"learning_rate": 1.5068493150684932e-06,
"loss": 0.5813,
"step": 41600
},
{
"epoch": 1.9018264840182648,
"grad_norm": 34.20349884033203,
"learning_rate": 1.4726027397260273e-06,
"loss": 0.786,
"step": 41650
},
{
"epoch": 1.904109589041096,
"grad_norm": 1.5305472612380981,
"learning_rate": 1.4383561643835616e-06,
"loss": 0.5928,
"step": 41700
},
{
"epoch": 1.906392694063927,
"grad_norm": 1.1295257806777954,
"learning_rate": 1.404109589041096e-06,
"loss": 0.7055,
"step": 41750
},
{
"epoch": 1.908675799086758,
"grad_norm": 23.80326271057129,
"learning_rate": 1.36986301369863e-06,
"loss": 0.5861,
"step": 41800
},
{
"epoch": 1.910958904109589,
"grad_norm": 3.346529960632324,
"learning_rate": 1.3356164383561643e-06,
"loss": 0.6864,
"step": 41850
},
{
"epoch": 1.91324200913242,
"grad_norm": 34.07392883300781,
"learning_rate": 1.3013698630136986e-06,
"loss": 0.6754,
"step": 41900
},
{
"epoch": 1.9155251141552512,
"grad_norm": 42.87485122680664,
"learning_rate": 1.2671232876712327e-06,
"loss": 0.613,
"step": 41950
},
{
"epoch": 1.9178082191780823,
"grad_norm": 9.337113380432129,
"learning_rate": 1.232876712328767e-06,
"loss": 0.5302,
"step": 42000
},
{
"epoch": 1.9200913242009132,
"grad_norm": 19.920682907104492,
"learning_rate": 1.1986301369863014e-06,
"loss": 0.4405,
"step": 42050
},
{
"epoch": 1.9223744292237441,
"grad_norm": 24.49388313293457,
"learning_rate": 1.1643835616438357e-06,
"loss": 0.696,
"step": 42100
},
{
"epoch": 1.9246575342465753,
"grad_norm": 7.732158184051514,
"learning_rate": 1.1301369863013698e-06,
"loss": 0.758,
"step": 42150
},
{
"epoch": 1.9269406392694064,
"grad_norm": 6.940062046051025,
"learning_rate": 1.095890410958904e-06,
"loss": 0.6804,
"step": 42200
},
{
"epoch": 1.9292237442922375,
"grad_norm": 1.062066674232483,
"learning_rate": 1.0616438356164384e-06,
"loss": 0.6059,
"step": 42250
},
{
"epoch": 1.9315068493150684,
"grad_norm": 20.240144729614258,
"learning_rate": 1.0273972602739725e-06,
"loss": 0.7992,
"step": 42300
},
{
"epoch": 1.9337899543378996,
"grad_norm": 19.4890193939209,
"learning_rate": 9.931506849315068e-07,
"loss": 0.6943,
"step": 42350
},
{
"epoch": 1.9360730593607305,
"grad_norm": 25.273487091064453,
"learning_rate": 9.589041095890411e-07,
"loss": 0.5925,
"step": 42400
},
{
"epoch": 1.9383561643835616,
"grad_norm": 21.916284561157227,
"learning_rate": 9.246575342465753e-07,
"loss": 0.6607,
"step": 42450
},
{
"epoch": 1.9406392694063928,
"grad_norm": 38.107566833496094,
"learning_rate": 8.904109589041095e-07,
"loss": 0.5807,
"step": 42500
},
{
"epoch": 1.9429223744292239,
"grad_norm": 8.979408264160156,
"learning_rate": 8.561643835616439e-07,
"loss": 0.6391,
"step": 42550
},
{
"epoch": 1.9452054794520548,
"grad_norm": 20.871389389038086,
"learning_rate": 8.219178082191781e-07,
"loss": 0.764,
"step": 42600
},
{
"epoch": 1.9474885844748857,
"grad_norm": 19.994056701660156,
"learning_rate": 7.876712328767123e-07,
"loss": 0.7959,
"step": 42650
},
{
"epoch": 1.9497716894977168,
"grad_norm": 25.47404670715332,
"learning_rate": 7.534246575342466e-07,
"loss": 0.5579,
"step": 42700
},
{
"epoch": 1.952054794520548,
"grad_norm": 10.598165512084961,
"learning_rate": 7.191780821917808e-07,
"loss": 0.8362,
"step": 42750
},
{
"epoch": 1.954337899543379,
"grad_norm": 1.3319069147109985,
"learning_rate": 6.84931506849315e-07,
"loss": 0.8132,
"step": 42800
},
{
"epoch": 1.95662100456621,
"grad_norm": 4.061497211456299,
"learning_rate": 6.506849315068493e-07,
"loss": 0.5364,
"step": 42850
},
{
"epoch": 1.958904109589041,
"grad_norm": 3.6196768283843994,
"learning_rate": 6.164383561643835e-07,
"loss": 0.57,
"step": 42900
},
{
"epoch": 1.961187214611872,
"grad_norm": 26.95933723449707,
"learning_rate": 5.821917808219178e-07,
"loss": 0.7364,
"step": 42950
},
{
"epoch": 1.9634703196347032,
"grad_norm": 25.18138313293457,
"learning_rate": 5.47945205479452e-07,
"loss": 0.5801,
"step": 43000
},
{
"epoch": 1.9657534246575343,
"grad_norm": 77.7193374633789,
"learning_rate": 5.136986301369863e-07,
"loss": 0.6965,
"step": 43050
},
{
"epoch": 1.9680365296803652,
"grad_norm": 10.553460121154785,
"learning_rate": 4.794520547945206e-07,
"loss": 0.6019,
"step": 43100
},
{
"epoch": 1.9703196347031964,
"grad_norm": 14.633034706115723,
"learning_rate": 4.4520547945205477e-07,
"loss": 0.5948,
"step": 43150
},
{
"epoch": 1.9726027397260273,
"grad_norm": 10.625903129577637,
"learning_rate": 4.1095890410958903e-07,
"loss": 0.5868,
"step": 43200
},
{
"epoch": 1.9748858447488584,
"grad_norm": 11.625406265258789,
"learning_rate": 3.767123287671233e-07,
"loss": 0.7335,
"step": 43250
},
{
"epoch": 1.9771689497716896,
"grad_norm": 11.875858306884766,
"learning_rate": 3.424657534246575e-07,
"loss": 0.6305,
"step": 43300
},
{
"epoch": 1.9794520547945207,
"grad_norm": 65.91475677490234,
"learning_rate": 3.0821917808219176e-07,
"loss": 0.5038,
"step": 43350
},
{
"epoch": 1.9817351598173516,
"grad_norm": 7.164722442626953,
"learning_rate": 2.73972602739726e-07,
"loss": 0.6426,
"step": 43400
},
{
"epoch": 1.9840182648401825,
"grad_norm": 19.229677200317383,
"learning_rate": 2.397260273972603e-07,
"loss": 0.7671,
"step": 43450
},
{
"epoch": 1.9863013698630136,
"grad_norm": 9.592227935791016,
"learning_rate": 2.0547945205479452e-07,
"loss": 0.7229,
"step": 43500
},
{
"epoch": 1.9885844748858448,
"grad_norm": 23.81863021850586,
"learning_rate": 1.7123287671232875e-07,
"loss": 0.5238,
"step": 43550
},
{
"epoch": 1.990867579908676,
"grad_norm": 30.481460571289062,
"learning_rate": 1.36986301369863e-07,
"loss": 0.5882,
"step": 43600
},
{
"epoch": 1.9931506849315068,
"grad_norm": 3.194218397140503,
"learning_rate": 1.0273972602739726e-07,
"loss": 0.5594,
"step": 43650
},
{
"epoch": 1.9954337899543377,
"grad_norm": 32.75349807739258,
"learning_rate": 6.84931506849315e-08,
"loss": 0.7056,
"step": 43700
},
{
"epoch": 1.9977168949771689,
"grad_norm": 37.13774490356445,
"learning_rate": 3.424657534246575e-08,
"loss": 0.7017,
"step": 43750
},
{
"epoch": 2.0,
"grad_norm": 38.73429489135742,
"learning_rate": 0.0,
"loss": 0.6852,
"step": 43800
},
{
"epoch": 2.0,
"step": 43800,
"total_flos": 3.864501390676132e+17,
"train_loss": 0.05644623499482734,
"train_runtime": 4615.1991,
"train_samples_per_second": 37.961,
"train_steps_per_second": 9.49
}
],
"logging_steps": 50,
"max_steps": 43800,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.864501390676132e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}