{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999747481124214, "eval_steps": 500, "global_step": 19800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00050503775157193, "grad_norm": 13.943564139476521, "learning_rate": 9.996464289322155e-06, "loss": 1.7553, "step": 10 }, { "epoch": 0.00101007550314386, "grad_norm": 5.789553200928907, "learning_rate": 9.991413274068087e-06, "loss": 0.8176, "step": 20 }, { "epoch": 0.00151511325471579, "grad_norm": 7.735975598755883, "learning_rate": 9.986362258814021e-06, "loss": 0.7018, "step": 30 }, { "epoch": 0.00202015100628772, "grad_norm": 3.5851607839184143, "learning_rate": 9.981311243559957e-06, "loss": 0.6599, "step": 40 }, { "epoch": 0.00252518875785965, "grad_norm": 6.237435656827707, "learning_rate": 9.976260228305891e-06, "loss": 0.6038, "step": 50 }, { "epoch": 0.00303022650943158, "grad_norm": 13.568356180772243, "learning_rate": 9.971209213051824e-06, "loss": 0.6027, "step": 60 }, { "epoch": 0.00353526426100351, "grad_norm": 3.235105055773784, "learning_rate": 9.966158197797758e-06, "loss": 0.5893, "step": 70 }, { "epoch": 0.00404030201257544, "grad_norm": 5.862991115468846, "learning_rate": 9.961107182543692e-06, "loss": 0.5863, "step": 80 }, { "epoch": 0.00454533976414737, "grad_norm": 3.3977361363891334, "learning_rate": 9.956056167289626e-06, "loss": 0.5736, "step": 90 }, { "epoch": 0.0050503775157193, "grad_norm": 4.281339414007124, "learning_rate": 9.95100515203556e-06, "loss": 0.5508, "step": 100 }, { "epoch": 0.00555541526729123, "grad_norm": 3.144694292761742, "learning_rate": 9.945954136781494e-06, "loss": 0.5493, "step": 110 }, { "epoch": 0.00606045301886316, "grad_norm": 3.0898861803222566, "learning_rate": 9.940903121527427e-06, "loss": 0.5417, "step": 120 }, { "epoch": 0.00656549077043509, "grad_norm": 14.79188542424222, "learning_rate": 9.935852106273361e-06, "loss": 0.5444, "step": 130 }, { "epoch": 0.00707052852200702, "grad_norm": 3.493197693686729, "learning_rate": 9.930801091019295e-06, "loss": 0.5284, "step": 140 }, { "epoch": 0.00757556627357895, "grad_norm": 10.7485482232531, "learning_rate": 9.92575007576523e-06, "loss": 0.53, "step": 150 }, { "epoch": 0.00808060402515088, "grad_norm": 4.095456564556807, "learning_rate": 9.920699060511163e-06, "loss": 0.5171, "step": 160 }, { "epoch": 0.00858564177672281, "grad_norm": 18.07561627235401, "learning_rate": 9.915648045257097e-06, "loss": 0.5293, "step": 170 }, { "epoch": 0.00909067952829474, "grad_norm": 14.881912561895916, "learning_rate": 9.910597030003032e-06, "loss": 0.525, "step": 180 }, { "epoch": 0.00959571727986667, "grad_norm": 3.077548490509973, "learning_rate": 9.905546014748966e-06, "loss": 0.4952, "step": 190 }, { "epoch": 0.0101007550314386, "grad_norm": 5.40203767482295, "learning_rate": 9.9004949994949e-06, "loss": 0.5172, "step": 200 }, { "epoch": 0.01060579278301053, "grad_norm": 12.22425699537504, "learning_rate": 9.895443984240834e-06, "loss": 0.5358, "step": 210 }, { "epoch": 0.01111083053458246, "grad_norm": 7.830646098334567, "learning_rate": 9.890392968986768e-06, "loss": 0.5102, "step": 220 }, { "epoch": 0.01161586828615439, "grad_norm": 3.814940763160932, "learning_rate": 9.8853419537327e-06, "loss": 0.4948, "step": 230 }, { "epoch": 0.01212090603772632, "grad_norm": 9.822466847144137, "learning_rate": 9.880290938478635e-06, "loss": 0.5217, "step": 240 }, { "epoch": 0.01262594378929825, "grad_norm": 46.6756643508386, "learning_rate": 9.875239923224569e-06, "loss": 0.5024, "step": 250 }, { "epoch": 0.01313098154087018, "grad_norm": 2.706287487769574, "learning_rate": 9.870188907970503e-06, "loss": 0.5161, "step": 260 }, { "epoch": 0.01363601929244211, "grad_norm": 2.211316891350979, "learning_rate": 9.865137892716437e-06, "loss": 0.5112, "step": 270 }, { "epoch": 0.01414105704401404, "grad_norm": 2.9091819978240663, "learning_rate": 9.860086877462371e-06, "loss": 0.5166, "step": 280 }, { "epoch": 0.01464609479558597, "grad_norm": 3.745541781549167, "learning_rate": 9.855035862208304e-06, "loss": 0.5051, "step": 290 }, { "epoch": 0.0151511325471579, "grad_norm": 2.8340353132271394, "learning_rate": 9.849984846954238e-06, "loss": 0.5075, "step": 300 }, { "epoch": 0.01565617029872983, "grad_norm": 3.869953185267734, "learning_rate": 9.844933831700174e-06, "loss": 0.5216, "step": 310 }, { "epoch": 0.01616120805030176, "grad_norm": 2.211016455052966, "learning_rate": 9.839882816446108e-06, "loss": 0.4973, "step": 320 }, { "epoch": 0.01666624580187369, "grad_norm": 3.3086106494157606, "learning_rate": 9.83483180119204e-06, "loss": 0.5023, "step": 330 }, { "epoch": 0.01717128355344562, "grad_norm": 3.680801950407282, "learning_rate": 9.829780785937974e-06, "loss": 0.4933, "step": 340 }, { "epoch": 0.01767632130501755, "grad_norm": 2.7546156381855935, "learning_rate": 9.824729770683908e-06, "loss": 0.5176, "step": 350 }, { "epoch": 0.01818135905658948, "grad_norm": 5.87533624997546, "learning_rate": 9.819678755429842e-06, "loss": 0.5014, "step": 360 }, { "epoch": 0.01868639680816141, "grad_norm": 5.806811873538911, "learning_rate": 9.814627740175777e-06, "loss": 0.5103, "step": 370 }, { "epoch": 0.01919143455973334, "grad_norm": 2.3849830922620137, "learning_rate": 9.80957672492171e-06, "loss": 0.5133, "step": 380 }, { "epoch": 0.01969647231130527, "grad_norm": 7.2668450331901715, "learning_rate": 9.804525709667643e-06, "loss": 0.5049, "step": 390 }, { "epoch": 0.0202015100628772, "grad_norm": 2.5520502646771264, "learning_rate": 9.799474694413577e-06, "loss": 0.5102, "step": 400 }, { "epoch": 0.02070654781444913, "grad_norm": 2.473700390480294, "learning_rate": 9.794423679159511e-06, "loss": 0.4825, "step": 410 }, { "epoch": 0.02121158556602106, "grad_norm": 3.648682107473895, "learning_rate": 9.789372663905445e-06, "loss": 0.4983, "step": 420 }, { "epoch": 0.02171662331759299, "grad_norm": 3.458879333685983, "learning_rate": 9.78432164865138e-06, "loss": 0.4912, "step": 430 }, { "epoch": 0.02222166106916492, "grad_norm": 3.0729192822937503, "learning_rate": 9.779270633397314e-06, "loss": 0.4905, "step": 440 }, { "epoch": 0.02272669882073685, "grad_norm": 17.505451267102025, "learning_rate": 9.774219618143248e-06, "loss": 0.4988, "step": 450 }, { "epoch": 0.02323173657230878, "grad_norm": 11.347644323036132, "learning_rate": 9.769168602889182e-06, "loss": 0.527, "step": 460 }, { "epoch": 0.02373677432388071, "grad_norm": 3.745152344082931, "learning_rate": 9.764117587635116e-06, "loss": 0.4751, "step": 470 }, { "epoch": 0.02424181207545264, "grad_norm": 3.430436888183069, "learning_rate": 9.75906657238105e-06, "loss": 0.4925, "step": 480 }, { "epoch": 0.02474684982702457, "grad_norm": 2.5106033133425316, "learning_rate": 9.754015557126983e-06, "loss": 0.4909, "step": 490 }, { "epoch": 0.0252518875785965, "grad_norm": 2.9444873201490345, "learning_rate": 9.748964541872917e-06, "loss": 0.5046, "step": 500 }, { "epoch": 0.02575692533016843, "grad_norm": 2.2226598370186355, "learning_rate": 9.743913526618851e-06, "loss": 0.5019, "step": 510 }, { "epoch": 0.02626196308174036, "grad_norm": 2.1431364606222143, "learning_rate": 9.738862511364785e-06, "loss": 0.4978, "step": 520 }, { "epoch": 0.02676700083331229, "grad_norm": 1.9687420001979, "learning_rate": 9.733811496110719e-06, "loss": 0.5168, "step": 530 }, { "epoch": 0.02727203858488422, "grad_norm": 1.790704686386335, "learning_rate": 9.728760480856653e-06, "loss": 0.4918, "step": 540 }, { "epoch": 0.02777707633645615, "grad_norm": 1.5122456879041994, "learning_rate": 9.723709465602587e-06, "loss": 0.5015, "step": 550 }, { "epoch": 0.02828211408802808, "grad_norm": 2.155745175549288, "learning_rate": 9.71865845034852e-06, "loss": 0.4958, "step": 560 }, { "epoch": 0.02878715183960001, "grad_norm": 2.2999303399374753, "learning_rate": 9.713607435094454e-06, "loss": 0.494, "step": 570 }, { "epoch": 0.02929218959117194, "grad_norm": 1.9957939818387918, "learning_rate": 9.70855641984039e-06, "loss": 0.4772, "step": 580 }, { "epoch": 0.02979722734274387, "grad_norm": 2.7790545722653137, "learning_rate": 9.703505404586324e-06, "loss": 0.4958, "step": 590 }, { "epoch": 0.0303022650943158, "grad_norm": 1.8902194621075885, "learning_rate": 9.698454389332256e-06, "loss": 0.4811, "step": 600 }, { "epoch": 0.03080730284588773, "grad_norm": 2.330090449709308, "learning_rate": 9.69340337407819e-06, "loss": 0.4955, "step": 610 }, { "epoch": 0.03131234059745966, "grad_norm": 2.63971178528007, "learning_rate": 9.688352358824125e-06, "loss": 0.481, "step": 620 }, { "epoch": 0.03181737834903159, "grad_norm": 3.9470995700087093, "learning_rate": 9.683301343570059e-06, "loss": 0.4911, "step": 630 }, { "epoch": 0.03232241610060352, "grad_norm": 3.705578063273332, "learning_rate": 9.678250328315993e-06, "loss": 0.4707, "step": 640 }, { "epoch": 0.03282745385217545, "grad_norm": 2.583661159495941, "learning_rate": 9.673199313061927e-06, "loss": 0.4599, "step": 650 }, { "epoch": 0.03333249160374738, "grad_norm": 2.0174541627393743, "learning_rate": 9.66814829780786e-06, "loss": 0.4941, "step": 660 }, { "epoch": 0.03383752935531931, "grad_norm": 4.576484208609804, "learning_rate": 9.663097282553793e-06, "loss": 0.4839, "step": 670 }, { "epoch": 0.03434256710689124, "grad_norm": 5.516602752547856, "learning_rate": 9.658046267299728e-06, "loss": 0.4652, "step": 680 }, { "epoch": 0.03484760485846317, "grad_norm": 1.9835012286897338, "learning_rate": 9.652995252045662e-06, "loss": 0.4573, "step": 690 }, { "epoch": 0.0353526426100351, "grad_norm": 2.3068456370975596, "learning_rate": 9.647944236791596e-06, "loss": 0.4939, "step": 700 }, { "epoch": 0.03585768036160703, "grad_norm": 2.504718982689086, "learning_rate": 9.64289322153753e-06, "loss": 0.479, "step": 710 }, { "epoch": 0.03636271811317896, "grad_norm": 3.0659241109345885, "learning_rate": 9.637842206283464e-06, "loss": 0.4668, "step": 720 }, { "epoch": 0.03686775586475089, "grad_norm": 5.822499608800306, "learning_rate": 9.632791191029398e-06, "loss": 0.4776, "step": 730 }, { "epoch": 0.03737279361632282, "grad_norm": 2.282902491093137, "learning_rate": 9.627740175775332e-06, "loss": 0.4809, "step": 740 }, { "epoch": 0.03787783136789475, "grad_norm": 3.6160296320382623, "learning_rate": 9.622689160521266e-06, "loss": 0.4772, "step": 750 }, { "epoch": 0.03838286911946668, "grad_norm": 2.168648546785388, "learning_rate": 9.617638145267199e-06, "loss": 0.4727, "step": 760 }, { "epoch": 0.03888790687103861, "grad_norm": 2.3243624027787746, "learning_rate": 9.612587130013133e-06, "loss": 0.4725, "step": 770 }, { "epoch": 0.03939294462261054, "grad_norm": 2.302262458969543, "learning_rate": 9.607536114759067e-06, "loss": 0.4866, "step": 780 }, { "epoch": 0.03989798237418247, "grad_norm": 8.258585402548862, "learning_rate": 9.602485099505001e-06, "loss": 0.4732, "step": 790 }, { "epoch": 0.0404030201257544, "grad_norm": 2.1689541135562007, "learning_rate": 9.597434084250935e-06, "loss": 0.4761, "step": 800 }, { "epoch": 0.04090805787732633, "grad_norm": 2.087988553230154, "learning_rate": 9.59238306899687e-06, "loss": 0.4755, "step": 810 }, { "epoch": 0.04141309562889826, "grad_norm": 2.2958375828670965, "learning_rate": 9.587332053742802e-06, "loss": 0.4688, "step": 820 }, { "epoch": 0.04191813338047019, "grad_norm": 2.3117097400641446, "learning_rate": 9.582281038488736e-06, "loss": 0.4635, "step": 830 }, { "epoch": 0.04242317113204212, "grad_norm": 8.93438709495605, "learning_rate": 9.57723002323467e-06, "loss": 0.4615, "step": 840 }, { "epoch": 0.04292820888361405, "grad_norm": 2.1041537246773134, "learning_rate": 9.572179007980606e-06, "loss": 0.493, "step": 850 }, { "epoch": 0.04343324663518598, "grad_norm": 1.73155627574332, "learning_rate": 9.567127992726538e-06, "loss": 0.4531, "step": 860 }, { "epoch": 0.04393828438675791, "grad_norm": 1.9302062705761438, "learning_rate": 9.562076977472473e-06, "loss": 0.4644, "step": 870 }, { "epoch": 0.04444332213832984, "grad_norm": 1.9821994690328757, "learning_rate": 9.557025962218407e-06, "loss": 0.4418, "step": 880 }, { "epoch": 0.04494835988990177, "grad_norm": 2.5134313739910716, "learning_rate": 9.55197494696434e-06, "loss": 0.4556, "step": 890 }, { "epoch": 0.0454533976414737, "grad_norm": 2.448160093260678, "learning_rate": 9.546923931710275e-06, "loss": 0.4867, "step": 900 }, { "epoch": 0.04595843539304563, "grad_norm": 3.6293868003589456, "learning_rate": 9.541872916456209e-06, "loss": 0.4575, "step": 910 }, { "epoch": 0.04646347314461756, "grad_norm": 5.740958333853467, "learning_rate": 9.536821901202143e-06, "loss": 0.4602, "step": 920 }, { "epoch": 0.04696851089618949, "grad_norm": 2.6003779275117282, "learning_rate": 9.531770885948076e-06, "loss": 0.4631, "step": 930 }, { "epoch": 0.04747354864776142, "grad_norm": 2.412862217111322, "learning_rate": 9.52671987069401e-06, "loss": 0.4765, "step": 940 }, { "epoch": 0.04797858639933335, "grad_norm": 6.6681363062446035, "learning_rate": 9.521668855439944e-06, "loss": 0.4658, "step": 950 }, { "epoch": 0.04848362415090528, "grad_norm": 5.764214860916012, "learning_rate": 9.516617840185878e-06, "loss": 0.4615, "step": 960 }, { "epoch": 0.04898866190247721, "grad_norm": 2.464119785527778, "learning_rate": 9.511566824931812e-06, "loss": 0.4496, "step": 970 }, { "epoch": 0.04949369965404914, "grad_norm": 6.238087195845154, "learning_rate": 9.506515809677746e-06, "loss": 0.4431, "step": 980 }, { "epoch": 0.04999873740562107, "grad_norm": 5.932767508185511, "learning_rate": 9.50146479442368e-06, "loss": 0.4574, "step": 990 }, { "epoch": 0.050503775157193, "grad_norm": 9.775572888274276, "learning_rate": 9.496413779169614e-06, "loss": 0.4596, "step": 1000 }, { "epoch": 0.05100881290876493, "grad_norm": 6.307739027249706, "learning_rate": 9.491362763915549e-06, "loss": 0.4515, "step": 1010 }, { "epoch": 0.05151385066033686, "grad_norm": 3.187729331697054, "learning_rate": 9.486311748661483e-06, "loss": 0.4713, "step": 1020 }, { "epoch": 0.05201888841190879, "grad_norm": 5.592090620849129, "learning_rate": 9.481260733407415e-06, "loss": 0.454, "step": 1030 }, { "epoch": 0.05252392616348072, "grad_norm": 5.281749256921326, "learning_rate": 9.47620971815335e-06, "loss": 0.4513, "step": 1040 }, { "epoch": 0.05302896391505265, "grad_norm": 3.415542870576186, "learning_rate": 9.471158702899283e-06, "loss": 0.4584, "step": 1050 }, { "epoch": 0.05353400166662458, "grad_norm": 3.4909280444023034, "learning_rate": 9.466107687645218e-06, "loss": 0.4491, "step": 1060 }, { "epoch": 0.05403903941819651, "grad_norm": 4.87148376874038, "learning_rate": 9.461056672391152e-06, "loss": 0.4672, "step": 1070 }, { "epoch": 0.05454407716976844, "grad_norm": 5.691263676353072, "learning_rate": 9.456005657137086e-06, "loss": 0.4398, "step": 1080 }, { "epoch": 0.05504911492134037, "grad_norm": 7.086742552442662, "learning_rate": 9.450954641883018e-06, "loss": 0.4657, "step": 1090 }, { "epoch": 0.0555541526729123, "grad_norm": 3.941589115392049, "learning_rate": 9.445903626628952e-06, "loss": 0.4559, "step": 1100 }, { "epoch": 0.05605919042448423, "grad_norm": 11.794328327356654, "learning_rate": 9.440852611374886e-06, "loss": 0.4436, "step": 1110 }, { "epoch": 0.05656422817605616, "grad_norm": 9.586300666497886, "learning_rate": 9.435801596120822e-06, "loss": 0.4581, "step": 1120 }, { "epoch": 0.05706926592762809, "grad_norm": 3.5060354246476484, "learning_rate": 9.430750580866755e-06, "loss": 0.4414, "step": 1130 }, { "epoch": 0.05757430367920002, "grad_norm": 2.9623447600620136, "learning_rate": 9.425699565612689e-06, "loss": 0.4475, "step": 1140 }, { "epoch": 0.05807934143077195, "grad_norm": 10.925832762761535, "learning_rate": 9.420648550358623e-06, "loss": 0.4454, "step": 1150 }, { "epoch": 0.05858437918234388, "grad_norm": 3.275733802006869, "learning_rate": 9.415597535104557e-06, "loss": 0.4349, "step": 1160 }, { "epoch": 0.05908941693391581, "grad_norm": 4.379989620996673, "learning_rate": 9.410546519850491e-06, "loss": 0.455, "step": 1170 }, { "epoch": 0.05959445468548774, "grad_norm": 2.1574051089026893, "learning_rate": 9.405495504596425e-06, "loss": 0.4551, "step": 1180 }, { "epoch": 0.06009949243705967, "grad_norm": 2.5566027220209344, "learning_rate": 9.400444489342358e-06, "loss": 0.4496, "step": 1190 }, { "epoch": 0.0606045301886316, "grad_norm": 3.3866623066830153, "learning_rate": 9.395393474088292e-06, "loss": 0.4705, "step": 1200 }, { "epoch": 0.06110956794020353, "grad_norm": 3.166133789377687, "learning_rate": 9.390342458834226e-06, "loss": 0.4556, "step": 1210 }, { "epoch": 0.06161460569177546, "grad_norm": 3.91861184581623, "learning_rate": 9.38529144358016e-06, "loss": 0.4673, "step": 1220 }, { "epoch": 0.06211964344334739, "grad_norm": 3.380841845875404, "learning_rate": 9.380240428326094e-06, "loss": 0.4551, "step": 1230 }, { "epoch": 0.06262468119491932, "grad_norm": 8.214890466142487, "learning_rate": 9.375189413072028e-06, "loss": 0.4436, "step": 1240 }, { "epoch": 0.06312971894649125, "grad_norm": 3.5096622602562904, "learning_rate": 9.37013839781796e-06, "loss": 0.4455, "step": 1250 }, { "epoch": 0.06363475669806318, "grad_norm": 3.773235111611304, "learning_rate": 9.365087382563897e-06, "loss": 0.4622, "step": 1260 }, { "epoch": 0.0641397944496351, "grad_norm": 6.295753580294585, "learning_rate": 9.36003636730983e-06, "loss": 0.4602, "step": 1270 }, { "epoch": 0.06464483220120704, "grad_norm": 2.725162214074834, "learning_rate": 9.354985352055765e-06, "loss": 0.4597, "step": 1280 }, { "epoch": 0.06514986995277897, "grad_norm": 2.52312004043461, "learning_rate": 9.349934336801699e-06, "loss": 0.4617, "step": 1290 }, { "epoch": 0.0656549077043509, "grad_norm": 2.789794392473564, "learning_rate": 9.344883321547631e-06, "loss": 0.4505, "step": 1300 }, { "epoch": 0.06615994545592283, "grad_norm": 3.952687566204272, "learning_rate": 9.339832306293566e-06, "loss": 0.4488, "step": 1310 }, { "epoch": 0.06666498320749475, "grad_norm": 2.4979475288074284, "learning_rate": 9.3347812910395e-06, "loss": 0.436, "step": 1320 }, { "epoch": 0.06717002095906668, "grad_norm": 3.3362793627660685, "learning_rate": 9.329730275785434e-06, "loss": 0.4739, "step": 1330 }, { "epoch": 0.06767505871063861, "grad_norm": 2.4147254145567834, "learning_rate": 9.324679260531368e-06, "loss": 0.4627, "step": 1340 }, { "epoch": 0.06818009646221056, "grad_norm": 5.672214495427731, "learning_rate": 9.319628245277302e-06, "loss": 0.451, "step": 1350 }, { "epoch": 0.06868513421378249, "grad_norm": 1.8726502280944801, "learning_rate": 9.314577230023234e-06, "loss": 0.4417, "step": 1360 }, { "epoch": 0.06919017196535442, "grad_norm": 3.973145301581678, "learning_rate": 9.309526214769169e-06, "loss": 0.4253, "step": 1370 }, { "epoch": 0.06969520971692635, "grad_norm": 5.073872511256849, "learning_rate": 9.304475199515103e-06, "loss": 0.4408, "step": 1380 }, { "epoch": 0.07020024746849828, "grad_norm": 7.525570163399166, "learning_rate": 9.299424184261039e-06, "loss": 0.4307, "step": 1390 }, { "epoch": 0.0707052852200702, "grad_norm": 2.781161378212436, "learning_rate": 9.294373169006971e-06, "loss": 0.4536, "step": 1400 }, { "epoch": 0.07121032297164213, "grad_norm": 2.090988491048501, "learning_rate": 9.289322153752905e-06, "loss": 0.4583, "step": 1410 }, { "epoch": 0.07171536072321406, "grad_norm": 2.4437267649640715, "learning_rate": 9.28427113849884e-06, "loss": 0.4579, "step": 1420 }, { "epoch": 0.072220398474786, "grad_norm": 2.2076648371457033, "learning_rate": 9.279220123244773e-06, "loss": 0.4343, "step": 1430 }, { "epoch": 0.07272543622635792, "grad_norm": 1.8617491045028525, "learning_rate": 9.274169107990707e-06, "loss": 0.4509, "step": 1440 }, { "epoch": 0.07323047397792985, "grad_norm": 2.222879920477197, "learning_rate": 9.269118092736642e-06, "loss": 0.4577, "step": 1450 }, { "epoch": 0.07373551172950178, "grad_norm": 2.5321185321413626, "learning_rate": 9.264067077482574e-06, "loss": 0.4391, "step": 1460 }, { "epoch": 0.07424054948107371, "grad_norm": 1.8760025526162245, "learning_rate": 9.259016062228508e-06, "loss": 0.474, "step": 1470 }, { "epoch": 0.07474558723264564, "grad_norm": 4.085966137442041, "learning_rate": 9.253965046974442e-06, "loss": 0.4444, "step": 1480 }, { "epoch": 0.07525062498421757, "grad_norm": 3.6849288920231547, "learning_rate": 9.248914031720376e-06, "loss": 0.4631, "step": 1490 }, { "epoch": 0.0757556627357895, "grad_norm": 2.0812547682300395, "learning_rate": 9.24386301646631e-06, "loss": 0.4429, "step": 1500 }, { "epoch": 0.07626070048736143, "grad_norm": 2.1885414591057923, "learning_rate": 9.238812001212245e-06, "loss": 0.4453, "step": 1510 }, { "epoch": 0.07676573823893336, "grad_norm": 2.3972513453985234, "learning_rate": 9.233760985958177e-06, "loss": 0.4541, "step": 1520 }, { "epoch": 0.07727077599050529, "grad_norm": 1.715103982949988, "learning_rate": 9.228709970704113e-06, "loss": 0.4218, "step": 1530 }, { "epoch": 0.07777581374207722, "grad_norm": 2.825431193796437, "learning_rate": 9.223658955450047e-06, "loss": 0.4552, "step": 1540 }, { "epoch": 0.07828085149364915, "grad_norm": 4.126140659488343, "learning_rate": 9.218607940195981e-06, "loss": 0.4379, "step": 1550 }, { "epoch": 0.07878588924522108, "grad_norm": 4.979196187347629, "learning_rate": 9.213556924941914e-06, "loss": 0.4649, "step": 1560 }, { "epoch": 0.079290926996793, "grad_norm": 5.8119930612941735, "learning_rate": 9.208505909687848e-06, "loss": 0.4474, "step": 1570 }, { "epoch": 0.07979596474836494, "grad_norm": 2.5598139471551864, "learning_rate": 9.203454894433782e-06, "loss": 0.4645, "step": 1580 }, { "epoch": 0.08030100249993687, "grad_norm": 2.9414494655541112, "learning_rate": 9.198403879179716e-06, "loss": 0.4431, "step": 1590 }, { "epoch": 0.0808060402515088, "grad_norm": 2.518508877017779, "learning_rate": 9.19335286392565e-06, "loss": 0.4513, "step": 1600 }, { "epoch": 0.08131107800308073, "grad_norm": 4.2677352209415425, "learning_rate": 9.188301848671584e-06, "loss": 0.4362, "step": 1610 }, { "epoch": 0.08181611575465265, "grad_norm": 2.376190662511493, "learning_rate": 9.183250833417518e-06, "loss": 0.4419, "step": 1620 }, { "epoch": 0.08232115350622458, "grad_norm": 2.311824528121457, "learning_rate": 9.17819981816345e-06, "loss": 0.4543, "step": 1630 }, { "epoch": 0.08282619125779651, "grad_norm": 2.8591324038418207, "learning_rate": 9.173148802909385e-06, "loss": 0.4449, "step": 1640 }, { "epoch": 0.08333122900936846, "grad_norm": 2.0096705761911737, "learning_rate": 9.168097787655319e-06, "loss": 0.4413, "step": 1650 }, { "epoch": 0.08383626676094039, "grad_norm": 2.536437573585815, "learning_rate": 9.163046772401255e-06, "loss": 0.4527, "step": 1660 }, { "epoch": 0.08434130451251232, "grad_norm": 1.974773461430328, "learning_rate": 9.157995757147187e-06, "loss": 0.4471, "step": 1670 }, { "epoch": 0.08484634226408425, "grad_norm": 3.0369504080501755, "learning_rate": 9.152944741893121e-06, "loss": 0.4523, "step": 1680 }, { "epoch": 0.08535138001565618, "grad_norm": 2.3986445472475455, "learning_rate": 9.147893726639055e-06, "loss": 0.4367, "step": 1690 }, { "epoch": 0.0858564177672281, "grad_norm": 2.808593335583457, "learning_rate": 9.14284271138499e-06, "loss": 0.4465, "step": 1700 }, { "epoch": 0.08636145551880003, "grad_norm": 2.516839276650598, "learning_rate": 9.137791696130924e-06, "loss": 0.4492, "step": 1710 }, { "epoch": 0.08686649327037196, "grad_norm": 2.458679996734928, "learning_rate": 9.132740680876858e-06, "loss": 0.4387, "step": 1720 }, { "epoch": 0.0873715310219439, "grad_norm": 2.691340501537917, "learning_rate": 9.12768966562279e-06, "loss": 0.4591, "step": 1730 }, { "epoch": 0.08787656877351582, "grad_norm": 2.2677007816655244, "learning_rate": 9.122638650368724e-06, "loss": 0.4463, "step": 1740 }, { "epoch": 0.08838160652508775, "grad_norm": 2.0671070241912797, "learning_rate": 9.117587635114659e-06, "loss": 0.4307, "step": 1750 }, { "epoch": 0.08888664427665968, "grad_norm": 2.7605610064361596, "learning_rate": 9.112536619860593e-06, "loss": 0.4391, "step": 1760 }, { "epoch": 0.08939168202823161, "grad_norm": 2.437909002135904, "learning_rate": 9.107485604606527e-06, "loss": 0.4526, "step": 1770 }, { "epoch": 0.08989671977980354, "grad_norm": 2.99196689672201, "learning_rate": 9.102434589352461e-06, "loss": 0.4593, "step": 1780 }, { "epoch": 0.09040175753137547, "grad_norm": 1.8141907580964987, "learning_rate": 9.097383574098393e-06, "loss": 0.4611, "step": 1790 }, { "epoch": 0.0909067952829474, "grad_norm": 2.018729489961064, "learning_rate": 9.092332558844329e-06, "loss": 0.4594, "step": 1800 }, { "epoch": 0.09141183303451933, "grad_norm": 1.9703582862843025, "learning_rate": 9.087281543590263e-06, "loss": 0.4382, "step": 1810 }, { "epoch": 0.09191687078609126, "grad_norm": 1.6921508201701456, "learning_rate": 9.082230528336197e-06, "loss": 0.4561, "step": 1820 }, { "epoch": 0.09242190853766319, "grad_norm": 3.029420346852276, "learning_rate": 9.07717951308213e-06, "loss": 0.4441, "step": 1830 }, { "epoch": 0.09292694628923512, "grad_norm": 1.9103397185841393, "learning_rate": 9.072128497828064e-06, "loss": 0.4345, "step": 1840 }, { "epoch": 0.09343198404080705, "grad_norm": 1.6003619276106154, "learning_rate": 9.067077482573998e-06, "loss": 0.454, "step": 1850 }, { "epoch": 0.09393702179237898, "grad_norm": 2.039612541605303, "learning_rate": 9.062026467319932e-06, "loss": 0.4472, "step": 1860 }, { "epoch": 0.09444205954395091, "grad_norm": 1.9475921005254484, "learning_rate": 9.056975452065866e-06, "loss": 0.4568, "step": 1870 }, { "epoch": 0.09494709729552284, "grad_norm": 1.556252720211933, "learning_rate": 9.0519244368118e-06, "loss": 0.4462, "step": 1880 }, { "epoch": 0.09545213504709477, "grad_norm": 2.041709870164745, "learning_rate": 9.046873421557733e-06, "loss": 0.4522, "step": 1890 }, { "epoch": 0.0959571727986667, "grad_norm": 1.6042104819290137, "learning_rate": 9.041822406303667e-06, "loss": 0.4358, "step": 1900 }, { "epoch": 0.09646221055023863, "grad_norm": 1.7827733522561613, "learning_rate": 9.036771391049601e-06, "loss": 0.4541, "step": 1910 }, { "epoch": 0.09696724830181055, "grad_norm": 1.5335434779957613, "learning_rate": 9.031720375795535e-06, "loss": 0.462, "step": 1920 }, { "epoch": 0.09747228605338248, "grad_norm": 1.5961467635458402, "learning_rate": 9.02666936054147e-06, "loss": 0.444, "step": 1930 }, { "epoch": 0.09797732380495441, "grad_norm": 2.6325242433089637, "learning_rate": 9.021618345287403e-06, "loss": 0.4413, "step": 1940 }, { "epoch": 0.09848236155652636, "grad_norm": 2.8583722906916056, "learning_rate": 9.016567330033338e-06, "loss": 0.4444, "step": 1950 }, { "epoch": 0.09898739930809829, "grad_norm": 2.3144131776880763, "learning_rate": 9.011516314779272e-06, "loss": 0.4444, "step": 1960 }, { "epoch": 0.09949243705967022, "grad_norm": 2.1581296348642938, "learning_rate": 9.006465299525206e-06, "loss": 0.4335, "step": 1970 }, { "epoch": 0.09999747481124215, "grad_norm": 17.52409284022625, "learning_rate": 9.00141428427114e-06, "loss": 0.4315, "step": 1980 }, { "epoch": 0.10050251256281408, "grad_norm": 4.993014635080338, "learning_rate": 8.996363269017074e-06, "loss": 0.4397, "step": 1990 }, { "epoch": 0.101007550314386, "grad_norm": 2.877086759384337, "learning_rate": 8.991312253763007e-06, "loss": 0.4399, "step": 2000 }, { "epoch": 0.10151258806595793, "grad_norm": 1.8400304810419659, "learning_rate": 8.98626123850894e-06, "loss": 0.4362, "step": 2010 }, { "epoch": 0.10201762581752986, "grad_norm": 2.9024549851445993, "learning_rate": 8.981210223254875e-06, "loss": 0.4408, "step": 2020 }, { "epoch": 0.1025226635691018, "grad_norm": 2.0461267797053067, "learning_rate": 8.976159208000809e-06, "loss": 0.432, "step": 2030 }, { "epoch": 0.10302770132067372, "grad_norm": 3.503082468120821, "learning_rate": 8.971108192746743e-06, "loss": 0.4325, "step": 2040 }, { "epoch": 0.10353273907224565, "grad_norm": 2.388506460672893, "learning_rate": 8.966057177492677e-06, "loss": 0.4351, "step": 2050 }, { "epoch": 0.10403777682381758, "grad_norm": 2.1254282185067646, "learning_rate": 8.96100616223861e-06, "loss": 0.4471, "step": 2060 }, { "epoch": 0.10454281457538951, "grad_norm": 7.57345126491693, "learning_rate": 8.955955146984545e-06, "loss": 0.4397, "step": 2070 }, { "epoch": 0.10504785232696144, "grad_norm": 1.6413054443187884, "learning_rate": 8.95090413173048e-06, "loss": 0.4322, "step": 2080 }, { "epoch": 0.10555289007853337, "grad_norm": 1.8503237766357117, "learning_rate": 8.945853116476414e-06, "loss": 0.4599, "step": 2090 }, { "epoch": 0.1060579278301053, "grad_norm": 2.122766063633365, "learning_rate": 8.940802101222346e-06, "loss": 0.4297, "step": 2100 }, { "epoch": 0.10656296558167723, "grad_norm": 1.859081726279874, "learning_rate": 8.93575108596828e-06, "loss": 0.4448, "step": 2110 }, { "epoch": 0.10706800333324916, "grad_norm": 2.477456287176706, "learning_rate": 8.930700070714214e-06, "loss": 0.4358, "step": 2120 }, { "epoch": 0.10757304108482109, "grad_norm": 1.642087049827079, "learning_rate": 8.925649055460148e-06, "loss": 0.4549, "step": 2130 }, { "epoch": 0.10807807883639302, "grad_norm": 3.1351745453532818, "learning_rate": 8.920598040206083e-06, "loss": 0.4482, "step": 2140 }, { "epoch": 0.10858311658796495, "grad_norm": 3.2350948921462805, "learning_rate": 8.915547024952017e-06, "loss": 0.4426, "step": 2150 }, { "epoch": 0.10908815433953688, "grad_norm": 2.4276194333582986, "learning_rate": 8.910496009697949e-06, "loss": 0.4523, "step": 2160 }, { "epoch": 0.10959319209110881, "grad_norm": 2.425429246229103, "learning_rate": 8.905444994443883e-06, "loss": 0.4381, "step": 2170 }, { "epoch": 0.11009822984268074, "grad_norm": 2.8737600406248682, "learning_rate": 8.900393979189817e-06, "loss": 0.4355, "step": 2180 }, { "epoch": 0.11060326759425267, "grad_norm": 1.868046294608527, "learning_rate": 8.895342963935751e-06, "loss": 0.4445, "step": 2190 }, { "epoch": 0.1111083053458246, "grad_norm": 7.059764277040628, "learning_rate": 8.890291948681686e-06, "loss": 0.451, "step": 2200 }, { "epoch": 0.11161334309739653, "grad_norm": 2.155018266737317, "learning_rate": 8.88524093342762e-06, "loss": 0.4421, "step": 2210 }, { "epoch": 0.11211838084896845, "grad_norm": 9.177888788340095, "learning_rate": 8.880189918173554e-06, "loss": 0.4398, "step": 2220 }, { "epoch": 0.11262341860054038, "grad_norm": 3.143167600541576, "learning_rate": 8.875138902919488e-06, "loss": 0.4297, "step": 2230 }, { "epoch": 0.11312845635211231, "grad_norm": 2.152834452055092, "learning_rate": 8.870087887665422e-06, "loss": 0.4529, "step": 2240 }, { "epoch": 0.11363349410368426, "grad_norm": 14.793812918224992, "learning_rate": 8.865036872411356e-06, "loss": 0.4383, "step": 2250 }, { "epoch": 0.11413853185525619, "grad_norm": 3.45745902653208, "learning_rate": 8.859985857157289e-06, "loss": 0.4302, "step": 2260 }, { "epoch": 0.11464356960682812, "grad_norm": 2.7114403885624605, "learning_rate": 8.854934841903223e-06, "loss": 0.4251, "step": 2270 }, { "epoch": 0.11514860735840005, "grad_norm": 2.723960534334651, "learning_rate": 8.849883826649157e-06, "loss": 0.4286, "step": 2280 }, { "epoch": 0.11565364510997198, "grad_norm": 2.4905833599831415, "learning_rate": 8.844832811395091e-06, "loss": 0.4508, "step": 2290 }, { "epoch": 0.1161586828615439, "grad_norm": 20.83563549184929, "learning_rate": 8.839781796141025e-06, "loss": 0.4474, "step": 2300 }, { "epoch": 0.11666372061311583, "grad_norm": 3.9724073335359336, "learning_rate": 8.83473078088696e-06, "loss": 0.4476, "step": 2310 }, { "epoch": 0.11716875836468776, "grad_norm": 2.1463739260782106, "learning_rate": 8.829679765632892e-06, "loss": 0.4189, "step": 2320 }, { "epoch": 0.1176737961162597, "grad_norm": 3.592083818788885, "learning_rate": 8.824628750378826e-06, "loss": 0.4242, "step": 2330 }, { "epoch": 0.11817883386783162, "grad_norm": 2.548758001659721, "learning_rate": 8.819577735124762e-06, "loss": 0.4112, "step": 2340 }, { "epoch": 0.11868387161940355, "grad_norm": 2.713449956929624, "learning_rate": 8.814526719870696e-06, "loss": 0.4173, "step": 2350 }, { "epoch": 0.11918890937097548, "grad_norm": 3.2378838703432016, "learning_rate": 8.80947570461663e-06, "loss": 0.4256, "step": 2360 }, { "epoch": 0.11969394712254741, "grad_norm": 3.8665508352583013, "learning_rate": 8.804424689362562e-06, "loss": 0.4241, "step": 2370 }, { "epoch": 0.12019898487411934, "grad_norm": 4.556177393216747, "learning_rate": 8.799373674108496e-06, "loss": 0.4282, "step": 2380 }, { "epoch": 0.12070402262569127, "grad_norm": 3.687650501163452, "learning_rate": 8.79432265885443e-06, "loss": 0.4242, "step": 2390 }, { "epoch": 0.1212090603772632, "grad_norm": 2.9530727834279595, "learning_rate": 8.789271643600365e-06, "loss": 0.4301, "step": 2400 }, { "epoch": 0.12171409812883513, "grad_norm": 2.916996176159921, "learning_rate": 8.784220628346299e-06, "loss": 0.4284, "step": 2410 }, { "epoch": 0.12221913588040706, "grad_norm": 2.2417118662412734, "learning_rate": 8.779169613092233e-06, "loss": 0.4186, "step": 2420 }, { "epoch": 0.12272417363197899, "grad_norm": 2.947908220653578, "learning_rate": 8.774118597838165e-06, "loss": 0.4223, "step": 2430 }, { "epoch": 0.12322921138355092, "grad_norm": 2.071616410723938, "learning_rate": 8.7690675825841e-06, "loss": 0.4375, "step": 2440 }, { "epoch": 0.12373424913512285, "grad_norm": 2.8361443264190296, "learning_rate": 8.764016567330034e-06, "loss": 0.426, "step": 2450 }, { "epoch": 0.12423928688669478, "grad_norm": 2.820818503243756, "learning_rate": 8.758965552075968e-06, "loss": 0.4341, "step": 2460 }, { "epoch": 0.12474432463826671, "grad_norm": 3.136410856304739, "learning_rate": 8.753914536821902e-06, "loss": 0.4445, "step": 2470 }, { "epoch": 0.12524936238983864, "grad_norm": 8.047547235777753, "learning_rate": 8.748863521567836e-06, "loss": 0.4267, "step": 2480 }, { "epoch": 0.12575440014141057, "grad_norm": 3.892449775126266, "learning_rate": 8.74381250631377e-06, "loss": 0.419, "step": 2490 }, { "epoch": 0.1262594378929825, "grad_norm": 2.354897593217238, "learning_rate": 8.738761491059704e-06, "loss": 0.4272, "step": 2500 }, { "epoch": 0.12676447564455443, "grad_norm": 2.5811852973740566, "learning_rate": 8.733710475805638e-06, "loss": 0.4253, "step": 2510 }, { "epoch": 0.12726951339612635, "grad_norm": 6.105044708207583, "learning_rate": 8.728659460551572e-06, "loss": 0.4214, "step": 2520 }, { "epoch": 0.12777455114769828, "grad_norm": 6.168633495733829, "learning_rate": 8.723608445297505e-06, "loss": 0.4374, "step": 2530 }, { "epoch": 0.1282795888992702, "grad_norm": 3.3331523293740677, "learning_rate": 8.718557430043439e-06, "loss": 0.4221, "step": 2540 }, { "epoch": 0.12878462665084214, "grad_norm": 3.1244184035542895, "learning_rate": 8.713506414789373e-06, "loss": 0.4362, "step": 2550 }, { "epoch": 0.12928966440241407, "grad_norm": 3.438277611759734, "learning_rate": 8.708455399535307e-06, "loss": 0.4193, "step": 2560 }, { "epoch": 0.129794702153986, "grad_norm": 5.520539408202482, "learning_rate": 8.703404384281241e-06, "loss": 0.4057, "step": 2570 }, { "epoch": 0.13029973990555793, "grad_norm": 5.422130415998936, "learning_rate": 8.698353369027176e-06, "loss": 0.4384, "step": 2580 }, { "epoch": 0.13080477765712986, "grad_norm": 3.341989913378195, "learning_rate": 8.693302353773108e-06, "loss": 0.4162, "step": 2590 }, { "epoch": 0.1313098154087018, "grad_norm": 2.475515489410939, "learning_rate": 8.688251338519042e-06, "loss": 0.4348, "step": 2600 }, { "epoch": 0.13181485316027372, "grad_norm": 2.550493025405498, "learning_rate": 8.683200323264976e-06, "loss": 0.4173, "step": 2610 }, { "epoch": 0.13231989091184565, "grad_norm": 2.0333378304030956, "learning_rate": 8.678149308010912e-06, "loss": 0.423, "step": 2620 }, { "epoch": 0.13282492866341758, "grad_norm": 2.2902374319536176, "learning_rate": 8.673098292756844e-06, "loss": 0.443, "step": 2630 }, { "epoch": 0.1333299664149895, "grad_norm": 2.774814046594427, "learning_rate": 8.668047277502779e-06, "loss": 0.4174, "step": 2640 }, { "epoch": 0.13383500416656144, "grad_norm": 3.8730252519897013, "learning_rate": 8.662996262248713e-06, "loss": 0.4248, "step": 2650 }, { "epoch": 0.13434004191813337, "grad_norm": 2.6952038500341127, "learning_rate": 8.657945246994647e-06, "loss": 0.4117, "step": 2660 }, { "epoch": 0.1348450796697053, "grad_norm": 2.7361849988399634, "learning_rate": 8.652894231740581e-06, "loss": 0.4249, "step": 2670 }, { "epoch": 0.13535011742127723, "grad_norm": 2.9758725333203535, "learning_rate": 8.647843216486515e-06, "loss": 0.4381, "step": 2680 }, { "epoch": 0.13585515517284916, "grad_norm": 2.551067170047961, "learning_rate": 8.642792201232447e-06, "loss": 0.4214, "step": 2690 }, { "epoch": 0.13636019292442111, "grad_norm": 2.6334795048127724, "learning_rate": 8.637741185978382e-06, "loss": 0.427, "step": 2700 }, { "epoch": 0.13686523067599304, "grad_norm": 3.9869976953943866, "learning_rate": 8.632690170724316e-06, "loss": 0.4101, "step": 2710 }, { "epoch": 0.13737026842756497, "grad_norm": 3.313933800574665, "learning_rate": 8.62763915547025e-06, "loss": 0.4138, "step": 2720 }, { "epoch": 0.1378753061791369, "grad_norm": 3.641595792417612, "learning_rate": 8.622588140216184e-06, "loss": 0.4256, "step": 2730 }, { "epoch": 0.13838034393070883, "grad_norm": 5.36751356787116, "learning_rate": 8.617537124962118e-06, "loss": 0.4314, "step": 2740 }, { "epoch": 0.13888538168228076, "grad_norm": 2.5991069721105977, "learning_rate": 8.612486109708052e-06, "loss": 0.4157, "step": 2750 }, { "epoch": 0.1393904194338527, "grad_norm": 3.1641348139616463, "learning_rate": 8.607435094453986e-06, "loss": 0.4023, "step": 2760 }, { "epoch": 0.13989545718542462, "grad_norm": 3.021572554543311, "learning_rate": 8.60238407919992e-06, "loss": 0.4138, "step": 2770 }, { "epoch": 0.14040049493699655, "grad_norm": 3.0773716536091063, "learning_rate": 8.597333063945855e-06, "loss": 0.4347, "step": 2780 }, { "epoch": 0.14090553268856848, "grad_norm": 3.1572917500612236, "learning_rate": 8.592282048691789e-06, "loss": 0.425, "step": 2790 }, { "epoch": 0.1414105704401404, "grad_norm": 3.065547472806378, "learning_rate": 8.587231033437721e-06, "loss": 0.4227, "step": 2800 }, { "epoch": 0.14191560819171234, "grad_norm": 3.0283835796400593, "learning_rate": 8.582180018183655e-06, "loss": 0.4126, "step": 2810 }, { "epoch": 0.14242064594328427, "grad_norm": 3.173481302073063, "learning_rate": 8.57712900292959e-06, "loss": 0.434, "step": 2820 }, { "epoch": 0.1429256836948562, "grad_norm": 3.765724472944031, "learning_rate": 8.572077987675524e-06, "loss": 0.4463, "step": 2830 }, { "epoch": 0.14343072144642813, "grad_norm": 5.419243180691885, "learning_rate": 8.567026972421458e-06, "loss": 0.4213, "step": 2840 }, { "epoch": 0.14393575919800006, "grad_norm": 7.3295747683680865, "learning_rate": 8.561975957167392e-06, "loss": 0.4202, "step": 2850 }, { "epoch": 0.144440796949572, "grad_norm": 6.887688386053364, "learning_rate": 8.556924941913324e-06, "loss": 0.4238, "step": 2860 }, { "epoch": 0.14494583470114392, "grad_norm": 2.786632234737629, "learning_rate": 8.551873926659258e-06, "loss": 0.431, "step": 2870 }, { "epoch": 0.14545087245271585, "grad_norm": 2.60607889031576, "learning_rate": 8.546822911405192e-06, "loss": 0.4306, "step": 2880 }, { "epoch": 0.14595591020428778, "grad_norm": 8.477010546599903, "learning_rate": 8.541771896151128e-06, "loss": 0.4419, "step": 2890 }, { "epoch": 0.1464609479558597, "grad_norm": 2.27833113141815, "learning_rate": 8.53672088089706e-06, "loss": 0.4226, "step": 2900 }, { "epoch": 0.14696598570743163, "grad_norm": 4.423453943948206, "learning_rate": 8.531669865642995e-06, "loss": 0.439, "step": 2910 }, { "epoch": 0.14747102345900356, "grad_norm": 3.6944661359694013, "learning_rate": 8.526618850388929e-06, "loss": 0.4221, "step": 2920 }, { "epoch": 0.1479760612105755, "grad_norm": 5.078989868876925, "learning_rate": 8.521567835134863e-06, "loss": 0.4317, "step": 2930 }, { "epoch": 0.14848109896214742, "grad_norm": 3.37535562195142, "learning_rate": 8.516516819880797e-06, "loss": 0.4365, "step": 2940 }, { "epoch": 0.14898613671371935, "grad_norm": 4.387702072726384, "learning_rate": 8.511465804626731e-06, "loss": 0.4133, "step": 2950 }, { "epoch": 0.14949117446529128, "grad_norm": 4.377549617088979, "learning_rate": 8.506414789372664e-06, "loss": 0.4332, "step": 2960 }, { "epoch": 0.1499962122168632, "grad_norm": 3.6071548234377033, "learning_rate": 8.501363774118598e-06, "loss": 0.424, "step": 2970 }, { "epoch": 0.15050124996843514, "grad_norm": 3.2776129282491953, "learning_rate": 8.496312758864532e-06, "loss": 0.431, "step": 2980 }, { "epoch": 0.15100628772000707, "grad_norm": 4.474289915888015, "learning_rate": 8.491261743610466e-06, "loss": 0.4289, "step": 2990 }, { "epoch": 0.151511325471579, "grad_norm": 2.562841849224475, "learning_rate": 8.4862107283564e-06, "loss": 0.4291, "step": 3000 }, { "epoch": 0.15201636322315093, "grad_norm": 2.9179612414883325, "learning_rate": 8.481159713102334e-06, "loss": 0.4187, "step": 3010 }, { "epoch": 0.15252140097472286, "grad_norm": 3.2632959036982703, "learning_rate": 8.476108697848268e-06, "loss": 0.4212, "step": 3020 }, { "epoch": 0.1530264387262948, "grad_norm": 3.962692563420196, "learning_rate": 8.471057682594203e-06, "loss": 0.4228, "step": 3030 }, { "epoch": 0.15353147647786672, "grad_norm": 3.2213155600732803, "learning_rate": 8.466006667340137e-06, "loss": 0.4271, "step": 3040 }, { "epoch": 0.15403651422943865, "grad_norm": 2.3479787665404555, "learning_rate": 8.460955652086071e-06, "loss": 0.4329, "step": 3050 }, { "epoch": 0.15454155198101058, "grad_norm": 2.7693374782815035, "learning_rate": 8.455904636832003e-06, "loss": 0.4168, "step": 3060 }, { "epoch": 0.1550465897325825, "grad_norm": 2.7643943115933314, "learning_rate": 8.450853621577937e-06, "loss": 0.4145, "step": 3070 }, { "epoch": 0.15555162748415444, "grad_norm": 2.6062282259447422, "learning_rate": 8.445802606323872e-06, "loss": 0.4191, "step": 3080 }, { "epoch": 0.15605666523572637, "grad_norm": 2.60763373572543, "learning_rate": 8.440751591069806e-06, "loss": 0.4161, "step": 3090 }, { "epoch": 0.1565617029872983, "grad_norm": 2.4475412139703, "learning_rate": 8.43570057581574e-06, "loss": 0.4273, "step": 3100 }, { "epoch": 0.15706674073887023, "grad_norm": 2.9309156363483084, "learning_rate": 8.430649560561674e-06, "loss": 0.4042, "step": 3110 }, { "epoch": 0.15757177849044215, "grad_norm": 2.2191638521741845, "learning_rate": 8.425598545307608e-06, "loss": 0.4306, "step": 3120 }, { "epoch": 0.15807681624201408, "grad_norm": 2.0790541165617893, "learning_rate": 8.42054753005354e-06, "loss": 0.4219, "step": 3130 }, { "epoch": 0.158581853993586, "grad_norm": 2.344144785550955, "learning_rate": 8.415496514799475e-06, "loss": 0.44, "step": 3140 }, { "epoch": 0.15908689174515794, "grad_norm": 3.896107585478935, "learning_rate": 8.410445499545409e-06, "loss": 0.4196, "step": 3150 }, { "epoch": 0.15959192949672987, "grad_norm": 3.511331268768427, "learning_rate": 8.405394484291345e-06, "loss": 0.4138, "step": 3160 }, { "epoch": 0.1600969672483018, "grad_norm": 4.918842428536537, "learning_rate": 8.400343469037277e-06, "loss": 0.426, "step": 3170 }, { "epoch": 0.16060200499987373, "grad_norm": 2.670823674907055, "learning_rate": 8.395292453783211e-06, "loss": 0.4099, "step": 3180 }, { "epoch": 0.16110704275144566, "grad_norm": 2.125448197180906, "learning_rate": 8.390241438529145e-06, "loss": 0.4186, "step": 3190 }, { "epoch": 0.1616120805030176, "grad_norm": 1.971202039415502, "learning_rate": 8.38519042327508e-06, "loss": 0.4307, "step": 3200 }, { "epoch": 0.16211711825458952, "grad_norm": 3.637785251447948, "learning_rate": 8.380139408021013e-06, "loss": 0.4315, "step": 3210 }, { "epoch": 0.16262215600616145, "grad_norm": 3.1909501990683027, "learning_rate": 8.375088392766948e-06, "loss": 0.4103, "step": 3220 }, { "epoch": 0.16312719375773338, "grad_norm": 3.426808590014172, "learning_rate": 8.37003737751288e-06, "loss": 0.4178, "step": 3230 }, { "epoch": 0.1636322315093053, "grad_norm": 2.794301511590198, "learning_rate": 8.364986362258814e-06, "loss": 0.421, "step": 3240 }, { "epoch": 0.16413726926087724, "grad_norm": 2.6521369002493196, "learning_rate": 8.359935347004748e-06, "loss": 0.4279, "step": 3250 }, { "epoch": 0.16464230701244917, "grad_norm": 3.065473735931392, "learning_rate": 8.354884331750682e-06, "loss": 0.4242, "step": 3260 }, { "epoch": 0.1651473447640211, "grad_norm": 2.3676634296220826, "learning_rate": 8.349833316496616e-06, "loss": 0.4004, "step": 3270 }, { "epoch": 0.16565238251559303, "grad_norm": 3.0123313610182407, "learning_rate": 8.34478230124255e-06, "loss": 0.4168, "step": 3280 }, { "epoch": 0.16615742026716496, "grad_norm": 2.613652117431061, "learning_rate": 8.339731285988485e-06, "loss": 0.4032, "step": 3290 }, { "epoch": 0.16666245801873691, "grad_norm": 2.5495103609008383, "learning_rate": 8.334680270734419e-06, "loss": 0.4258, "step": 3300 }, { "epoch": 0.16716749577030884, "grad_norm": 1.944110652634474, "learning_rate": 8.329629255480353e-06, "loss": 0.4047, "step": 3310 }, { "epoch": 0.16767253352188077, "grad_norm": 3.996554861787568, "learning_rate": 8.324578240226287e-06, "loss": 0.4266, "step": 3320 }, { "epoch": 0.1681775712734527, "grad_norm": 3.311463894609568, "learning_rate": 8.31952722497222e-06, "loss": 0.4142, "step": 3330 }, { "epoch": 0.16868260902502463, "grad_norm": 2.6266078214610418, "learning_rate": 8.314476209718154e-06, "loss": 0.4106, "step": 3340 }, { "epoch": 0.16918764677659656, "grad_norm": 3.3843962429509533, "learning_rate": 8.309425194464088e-06, "loss": 0.4023, "step": 3350 }, { "epoch": 0.1696926845281685, "grad_norm": 2.755113613993984, "learning_rate": 8.304374179210022e-06, "loss": 0.3974, "step": 3360 }, { "epoch": 0.17019772227974042, "grad_norm": 3.915262353619256, "learning_rate": 8.299323163955956e-06, "loss": 0.4233, "step": 3370 }, { "epoch": 0.17070276003131235, "grad_norm": 6.128222421561234, "learning_rate": 8.29427214870189e-06, "loss": 0.3987, "step": 3380 }, { "epoch": 0.17120779778288428, "grad_norm": 3.244731601327553, "learning_rate": 8.289221133447823e-06, "loss": 0.408, "step": 3390 }, { "epoch": 0.1717128355344562, "grad_norm": 3.415058459654747, "learning_rate": 8.284170118193757e-06, "loss": 0.4264, "step": 3400 }, { "epoch": 0.17221787328602814, "grad_norm": 4.870959665296901, "learning_rate": 8.27911910293969e-06, "loss": 0.4068, "step": 3410 }, { "epoch": 0.17272291103760007, "grad_norm": 3.1738996571406837, "learning_rate": 8.274068087685625e-06, "loss": 0.4238, "step": 3420 }, { "epoch": 0.173227948789172, "grad_norm": 5.015822248252729, "learning_rate": 8.269017072431559e-06, "loss": 0.4258, "step": 3430 }, { "epoch": 0.17373298654074393, "grad_norm": 5.826443535049107, "learning_rate": 8.263966057177493e-06, "loss": 0.389, "step": 3440 }, { "epoch": 0.17423802429231586, "grad_norm": 3.9268362077276366, "learning_rate": 8.258915041923427e-06, "loss": 0.4302, "step": 3450 }, { "epoch": 0.1747430620438878, "grad_norm": 2.7948709929923155, "learning_rate": 8.253864026669361e-06, "loss": 0.4179, "step": 3460 }, { "epoch": 0.17524809979545972, "grad_norm": 7.055728348112125, "learning_rate": 8.248813011415296e-06, "loss": 0.4249, "step": 3470 }, { "epoch": 0.17575313754703165, "grad_norm": 11.223550772273683, "learning_rate": 8.24376199616123e-06, "loss": 0.4198, "step": 3480 }, { "epoch": 0.17625817529860358, "grad_norm": 3.272872614673025, "learning_rate": 8.238710980907164e-06, "loss": 0.412, "step": 3490 }, { "epoch": 0.1767632130501755, "grad_norm": 4.687786747360582, "learning_rate": 8.233659965653096e-06, "loss": 0.4143, "step": 3500 }, { "epoch": 0.17726825080174743, "grad_norm": 4.2131701830416315, "learning_rate": 8.22860895039903e-06, "loss": 0.407, "step": 3510 }, { "epoch": 0.17777328855331936, "grad_norm": 2.7350467089835058, "learning_rate": 8.223557935144965e-06, "loss": 0.4065, "step": 3520 }, { "epoch": 0.1782783263048913, "grad_norm": 15.963906126522279, "learning_rate": 8.218506919890899e-06, "loss": 0.4053, "step": 3530 }, { "epoch": 0.17878336405646322, "grad_norm": 6.283377242483946, "learning_rate": 8.213455904636833e-06, "loss": 0.4398, "step": 3540 }, { "epoch": 0.17928840180803515, "grad_norm": 4.211980280469181, "learning_rate": 8.208404889382767e-06, "loss": 0.4279, "step": 3550 }, { "epoch": 0.17979343955960708, "grad_norm": 4.888324462942457, "learning_rate": 8.2033538741287e-06, "loss": 0.4279, "step": 3560 }, { "epoch": 0.180298477311179, "grad_norm": 4.806718275911088, "learning_rate": 8.198302858874635e-06, "loss": 0.4074, "step": 3570 }, { "epoch": 0.18080351506275094, "grad_norm": 14.618909757426387, "learning_rate": 8.19325184362057e-06, "loss": 0.4377, "step": 3580 }, { "epoch": 0.18130855281432287, "grad_norm": 3.0145598452408935, "learning_rate": 8.188200828366503e-06, "loss": 0.4244, "step": 3590 }, { "epoch": 0.1818135905658948, "grad_norm": 3.537451616849706, "learning_rate": 8.183149813112436e-06, "loss": 0.4115, "step": 3600 }, { "epoch": 0.18231862831746673, "grad_norm": 6.760898974505609, "learning_rate": 8.17809879785837e-06, "loss": 0.4183, "step": 3610 }, { "epoch": 0.18282366606903866, "grad_norm": 3.1669599365914927, "learning_rate": 8.173047782604304e-06, "loss": 0.4116, "step": 3620 }, { "epoch": 0.1833287038206106, "grad_norm": 15.662389012118272, "learning_rate": 8.167996767350238e-06, "loss": 0.4208, "step": 3630 }, { "epoch": 0.18383374157218252, "grad_norm": 7.978949404174967, "learning_rate": 8.162945752096172e-06, "loss": 0.4355, "step": 3640 }, { "epoch": 0.18433877932375445, "grad_norm": 3.0917950299227375, "learning_rate": 8.157894736842106e-06, "loss": 0.4161, "step": 3650 }, { "epoch": 0.18484381707532638, "grad_norm": 5.585272554418732, "learning_rate": 8.152843721588039e-06, "loss": 0.4206, "step": 3660 }, { "epoch": 0.1853488548268983, "grad_norm": 2.7026089034564875, "learning_rate": 8.147792706333973e-06, "loss": 0.4092, "step": 3670 }, { "epoch": 0.18585389257847024, "grad_norm": 4.859931210791958, "learning_rate": 8.142741691079907e-06, "loss": 0.4064, "step": 3680 }, { "epoch": 0.18635893033004217, "grad_norm": 3.403058240228029, "learning_rate": 8.137690675825841e-06, "loss": 0.4082, "step": 3690 }, { "epoch": 0.1868639680816141, "grad_norm": 3.2005619654009076, "learning_rate": 8.132639660571775e-06, "loss": 0.4042, "step": 3700 }, { "epoch": 0.18736900583318603, "grad_norm": 2.6974451416766443, "learning_rate": 8.12758864531771e-06, "loss": 0.4291, "step": 3710 }, { "epoch": 0.18787404358475795, "grad_norm": 3.226834318122475, "learning_rate": 8.122537630063644e-06, "loss": 0.4173, "step": 3720 }, { "epoch": 0.18837908133632988, "grad_norm": 2.6103166184994895, "learning_rate": 8.117486614809578e-06, "loss": 0.4112, "step": 3730 }, { "epoch": 0.18888411908790181, "grad_norm": 2.691120701877213, "learning_rate": 8.112435599555512e-06, "loss": 0.4083, "step": 3740 }, { "epoch": 0.18938915683947374, "grad_norm": 3.323955083237021, "learning_rate": 8.107384584301446e-06, "loss": 0.4096, "step": 3750 }, { "epoch": 0.18989419459104567, "grad_norm": 6.965415675773204, "learning_rate": 8.102333569047378e-06, "loss": 0.3977, "step": 3760 }, { "epoch": 0.1903992323426176, "grad_norm": 2.451525045869991, "learning_rate": 8.097282553793313e-06, "loss": 0.4195, "step": 3770 }, { "epoch": 0.19090427009418953, "grad_norm": 2.6787347183394243, "learning_rate": 8.092231538539247e-06, "loss": 0.4121, "step": 3780 }, { "epoch": 0.19140930784576146, "grad_norm": 6.742330893738729, "learning_rate": 8.08718052328518e-06, "loss": 0.4102, "step": 3790 }, { "epoch": 0.1919143455973334, "grad_norm": 2.9458958646737083, "learning_rate": 8.082129508031115e-06, "loss": 0.4206, "step": 3800 }, { "epoch": 0.19241938334890532, "grad_norm": 4.018744503984466, "learning_rate": 8.077078492777049e-06, "loss": 0.4367, "step": 3810 }, { "epoch": 0.19292442110047725, "grad_norm": 4.594358535206507, "learning_rate": 8.072027477522983e-06, "loss": 0.4123, "step": 3820 }, { "epoch": 0.19342945885204918, "grad_norm": 7.765118457446688, "learning_rate": 8.066976462268916e-06, "loss": 0.4203, "step": 3830 }, { "epoch": 0.1939344966036211, "grad_norm": 4.571772892446487, "learning_rate": 8.061925447014851e-06, "loss": 0.4093, "step": 3840 }, { "epoch": 0.19443953435519304, "grad_norm": 3.0704420404585995, "learning_rate": 8.056874431760786e-06, "loss": 0.4195, "step": 3850 }, { "epoch": 0.19494457210676497, "grad_norm": 2.8235987578400334, "learning_rate": 8.05182341650672e-06, "loss": 0.4131, "step": 3860 }, { "epoch": 0.1954496098583369, "grad_norm": 4.664925999855343, "learning_rate": 8.046772401252652e-06, "loss": 0.4052, "step": 3870 }, { "epoch": 0.19595464760990883, "grad_norm": 3.0583937314253, "learning_rate": 8.041721385998586e-06, "loss": 0.4198, "step": 3880 }, { "epoch": 0.19645968536148076, "grad_norm": 2.973956981634855, "learning_rate": 8.03667037074452e-06, "loss": 0.4066, "step": 3890 }, { "epoch": 0.19696472311305271, "grad_norm": 2.803835666144933, "learning_rate": 8.031619355490454e-06, "loss": 0.4158, "step": 3900 }, { "epoch": 0.19746976086462464, "grad_norm": 4.834280824568942, "learning_rate": 8.026568340236389e-06, "loss": 0.4178, "step": 3910 }, { "epoch": 0.19797479861619657, "grad_norm": 10.038307243068243, "learning_rate": 8.021517324982323e-06, "loss": 0.4111, "step": 3920 }, { "epoch": 0.1984798363677685, "grad_norm": 7.610030059675689, "learning_rate": 8.016466309728255e-06, "loss": 0.4096, "step": 3930 }, { "epoch": 0.19898487411934043, "grad_norm": 3.6050764165551556, "learning_rate": 8.01141529447419e-06, "loss": 0.4233, "step": 3940 }, { "epoch": 0.19948991187091236, "grad_norm": 6.736237771102318, "learning_rate": 8.006364279220123e-06, "loss": 0.4153, "step": 3950 }, { "epoch": 0.1999949496224843, "grad_norm": 2.9583460730423443, "learning_rate": 8.001313263966057e-06, "loss": 0.4211, "step": 3960 }, { "epoch": 0.20049998737405622, "grad_norm": 2.2968014860075576, "learning_rate": 7.996262248711992e-06, "loss": 0.4289, "step": 3970 }, { "epoch": 0.20100502512562815, "grad_norm": 2.474011765446813, "learning_rate": 7.991211233457926e-06, "loss": 0.4233, "step": 3980 }, { "epoch": 0.20151006287720008, "grad_norm": 3.0063110556198778, "learning_rate": 7.98616021820386e-06, "loss": 0.428, "step": 3990 }, { "epoch": 0.202015100628772, "grad_norm": 4.355361962575098, "learning_rate": 7.981109202949794e-06, "loss": 0.4226, "step": 4000 }, { "epoch": 0.20252013838034394, "grad_norm": 2.6560259873816747, "learning_rate": 7.976058187695728e-06, "loss": 0.4175, "step": 4010 }, { "epoch": 0.20302517613191587, "grad_norm": 2.9619585376031647, "learning_rate": 7.971007172441662e-06, "loss": 0.4143, "step": 4020 }, { "epoch": 0.2035302138834878, "grad_norm": 2.066998306222369, "learning_rate": 7.965956157187595e-06, "loss": 0.4001, "step": 4030 }, { "epoch": 0.20403525163505973, "grad_norm": 4.632842633228617, "learning_rate": 7.960905141933529e-06, "loss": 0.4133, "step": 4040 }, { "epoch": 0.20454028938663166, "grad_norm": 6.512090967359411, "learning_rate": 7.955854126679463e-06, "loss": 0.4271, "step": 4050 }, { "epoch": 0.2050453271382036, "grad_norm": 3.4613990642774777, "learning_rate": 7.950803111425397e-06, "loss": 0.4035, "step": 4060 }, { "epoch": 0.20555036488977552, "grad_norm": 6.082524106916885, "learning_rate": 7.945752096171331e-06, "loss": 0.4186, "step": 4070 }, { "epoch": 0.20605540264134745, "grad_norm": 9.959451587245269, "learning_rate": 7.940701080917265e-06, "loss": 0.4136, "step": 4080 }, { "epoch": 0.20656044039291938, "grad_norm": 3.1580403384223694, "learning_rate": 7.935650065663198e-06, "loss": 0.4166, "step": 4090 }, { "epoch": 0.2070654781444913, "grad_norm": 43.253412774792665, "learning_rate": 7.930599050409132e-06, "loss": 0.4237, "step": 4100 }, { "epoch": 0.20757051589606323, "grad_norm": 4.161811983252057, "learning_rate": 7.925548035155068e-06, "loss": 0.426, "step": 4110 }, { "epoch": 0.20807555364763516, "grad_norm": 4.07148032877867, "learning_rate": 7.920497019901002e-06, "loss": 0.4046, "step": 4120 }, { "epoch": 0.2085805913992071, "grad_norm": 2.980398901316733, "learning_rate": 7.915446004646934e-06, "loss": 0.4018, "step": 4130 }, { "epoch": 0.20908562915077902, "grad_norm": 12.570414912224734, "learning_rate": 7.910394989392868e-06, "loss": 0.4249, "step": 4140 }, { "epoch": 0.20959066690235095, "grad_norm": 4.296760517181418, "learning_rate": 7.905343974138802e-06, "loss": 0.4134, "step": 4150 }, { "epoch": 0.21009570465392288, "grad_norm": 3.8924058247405795, "learning_rate": 7.900292958884737e-06, "loss": 0.4184, "step": 4160 }, { "epoch": 0.2106007424054948, "grad_norm": 5.074708812193615, "learning_rate": 7.89524194363067e-06, "loss": 0.4326, "step": 4170 }, { "epoch": 0.21110578015706674, "grad_norm": 4.094894279147386, "learning_rate": 7.890190928376605e-06, "loss": 0.427, "step": 4180 }, { "epoch": 0.21161081790863867, "grad_norm": 7.0180188428328245, "learning_rate": 7.885139913122539e-06, "loss": 0.4434, "step": 4190 }, { "epoch": 0.2121158556602106, "grad_norm": 3.1522461269078854, "learning_rate": 7.880088897868471e-06, "loss": 0.4118, "step": 4200 }, { "epoch": 0.21262089341178253, "grad_norm": 3.1387724966942048, "learning_rate": 7.875037882614405e-06, "loss": 0.4188, "step": 4210 }, { "epoch": 0.21312593116335446, "grad_norm": 2.6776667573826005, "learning_rate": 7.86998686736034e-06, "loss": 0.432, "step": 4220 }, { "epoch": 0.2136309689149264, "grad_norm": 3.6637419491007046, "learning_rate": 7.864935852106274e-06, "loss": 0.435, "step": 4230 }, { "epoch": 0.21413600666649832, "grad_norm": 5.882360941042325, "learning_rate": 7.859884836852208e-06, "loss": 0.4308, "step": 4240 }, { "epoch": 0.21464104441807025, "grad_norm": 8.010164762562015, "learning_rate": 7.854833821598142e-06, "loss": 0.4035, "step": 4250 }, { "epoch": 0.21514608216964218, "grad_norm": 2.940352809589755, "learning_rate": 7.849782806344076e-06, "loss": 0.4045, "step": 4260 }, { "epoch": 0.2156511199212141, "grad_norm": 5.768043762782633, "learning_rate": 7.84473179109001e-06, "loss": 0.4106, "step": 4270 }, { "epoch": 0.21615615767278604, "grad_norm": 4.169886235036406, "learning_rate": 7.839680775835944e-06, "loss": 0.4238, "step": 4280 }, { "epoch": 0.21666119542435797, "grad_norm": 3.249720384522653, "learning_rate": 7.834629760581878e-06, "loss": 0.3969, "step": 4290 }, { "epoch": 0.2171662331759299, "grad_norm": 5.065976709307023, "learning_rate": 7.829578745327811e-06, "loss": 0.4069, "step": 4300 }, { "epoch": 0.21767127092750183, "grad_norm": 3.9668832894751107, "learning_rate": 7.824527730073745e-06, "loss": 0.4125, "step": 4310 }, { "epoch": 0.21817630867907375, "grad_norm": 3.4619845168591796, "learning_rate": 7.819476714819679e-06, "loss": 0.4105, "step": 4320 }, { "epoch": 0.21868134643064568, "grad_norm": 3.886033142263623, "learning_rate": 7.814425699565613e-06, "loss": 0.4151, "step": 4330 }, { "epoch": 0.21918638418221761, "grad_norm": 3.5781821573741994, "learning_rate": 7.809374684311547e-06, "loss": 0.3923, "step": 4340 }, { "epoch": 0.21969142193378954, "grad_norm": 2.9767702411423325, "learning_rate": 7.804323669057482e-06, "loss": 0.3827, "step": 4350 }, { "epoch": 0.22019645968536147, "grad_norm": 3.3595860629684577, "learning_rate": 7.799272653803414e-06, "loss": 0.4103, "step": 4360 }, { "epoch": 0.2207014974369334, "grad_norm": 2.0952276159055154, "learning_rate": 7.794221638549348e-06, "loss": 0.4171, "step": 4370 }, { "epoch": 0.22120653518850533, "grad_norm": 3.1915802413191376, "learning_rate": 7.789170623295284e-06, "loss": 0.4273, "step": 4380 }, { "epoch": 0.22171157294007726, "grad_norm": 1.8629666469771935, "learning_rate": 7.784119608041218e-06, "loss": 0.4226, "step": 4390 }, { "epoch": 0.2222166106916492, "grad_norm": 4.627481223873417, "learning_rate": 7.77906859278715e-06, "loss": 0.3979, "step": 4400 }, { "epoch": 0.22272164844322112, "grad_norm": 3.4768855391600084, "learning_rate": 7.774017577533085e-06, "loss": 0.413, "step": 4410 }, { "epoch": 0.22322668619479305, "grad_norm": 2.8907573807850673, "learning_rate": 7.768966562279019e-06, "loss": 0.4201, "step": 4420 }, { "epoch": 0.22373172394636498, "grad_norm": 5.5052911721090805, "learning_rate": 7.763915547024953e-06, "loss": 0.4322, "step": 4430 }, { "epoch": 0.2242367616979369, "grad_norm": 2.171757782442608, "learning_rate": 7.758864531770887e-06, "loss": 0.4341, "step": 4440 }, { "epoch": 0.22474179944950884, "grad_norm": 2.092006999366198, "learning_rate": 7.753813516516821e-06, "loss": 0.422, "step": 4450 }, { "epoch": 0.22524683720108077, "grad_norm": 6.02493579048235, "learning_rate": 7.748762501262753e-06, "loss": 0.4105, "step": 4460 }, { "epoch": 0.2257518749526527, "grad_norm": 3.705631561875418, "learning_rate": 7.743711486008688e-06, "loss": 0.4081, "step": 4470 }, { "epoch": 0.22625691270422463, "grad_norm": 4.191020612596744, "learning_rate": 7.738660470754622e-06, "loss": 0.402, "step": 4480 }, { "epoch": 0.22676195045579656, "grad_norm": 2.9926161263513484, "learning_rate": 7.733609455500556e-06, "loss": 0.4105, "step": 4490 }, { "epoch": 0.22726698820736851, "grad_norm": 2.7804371886771304, "learning_rate": 7.72855844024649e-06, "loss": 0.4004, "step": 4500 }, { "epoch": 0.22777202595894044, "grad_norm": 3.3532410934862007, "learning_rate": 7.723507424992424e-06, "loss": 0.3982, "step": 4510 }, { "epoch": 0.22827706371051237, "grad_norm": 5.67085098197258, "learning_rate": 7.718456409738358e-06, "loss": 0.4244, "step": 4520 }, { "epoch": 0.2287821014620843, "grad_norm": 6.390172943127481, "learning_rate": 7.713405394484292e-06, "loss": 0.405, "step": 4530 }, { "epoch": 0.22928713921365623, "grad_norm": 2.0493376647118535, "learning_rate": 7.708354379230226e-06, "loss": 0.4091, "step": 4540 }, { "epoch": 0.22979217696522816, "grad_norm": 3.2998640142236373, "learning_rate": 7.70330336397616e-06, "loss": 0.4116, "step": 4550 }, { "epoch": 0.2302972147168001, "grad_norm": 9.39006495464318, "learning_rate": 7.698252348722095e-06, "loss": 0.4047, "step": 4560 }, { "epoch": 0.23080225246837202, "grad_norm": 5.551570211137529, "learning_rate": 7.693201333468027e-06, "loss": 0.408, "step": 4570 }, { "epoch": 0.23130729021994395, "grad_norm": 4.557076032338838, "learning_rate": 7.688150318213961e-06, "loss": 0.3971, "step": 4580 }, { "epoch": 0.23181232797151588, "grad_norm": 2.606206409240519, "learning_rate": 7.683099302959895e-06, "loss": 0.4178, "step": 4590 }, { "epoch": 0.2323173657230878, "grad_norm": 4.447142719835132, "learning_rate": 7.67804828770583e-06, "loss": 0.394, "step": 4600 }, { "epoch": 0.23282240347465974, "grad_norm": 3.0541458976753204, "learning_rate": 7.672997272451764e-06, "loss": 0.425, "step": 4610 }, { "epoch": 0.23332744122623167, "grad_norm": 2.176856844484498, "learning_rate": 7.667946257197698e-06, "loss": 0.4028, "step": 4620 }, { "epoch": 0.2338324789778036, "grad_norm": 2.308455454963439, "learning_rate": 7.66289524194363e-06, "loss": 0.4141, "step": 4630 }, { "epoch": 0.23433751672937553, "grad_norm": 2.112712712332131, "learning_rate": 7.657844226689564e-06, "loss": 0.419, "step": 4640 }, { "epoch": 0.23484255448094746, "grad_norm": 3.2748057827058634, "learning_rate": 7.6527932114355e-06, "loss": 0.4122, "step": 4650 }, { "epoch": 0.2353475922325194, "grad_norm": 4.614165718449686, "learning_rate": 7.647742196181434e-06, "loss": 0.4181, "step": 4660 }, { "epoch": 0.23585262998409132, "grad_norm": 3.4633789585985038, "learning_rate": 7.642691180927367e-06, "loss": 0.4134, "step": 4670 }, { "epoch": 0.23635766773566325, "grad_norm": 2.459474386028591, "learning_rate": 7.6376401656733e-06, "loss": 0.4045, "step": 4680 }, { "epoch": 0.23686270548723518, "grad_norm": 4.467572890592401, "learning_rate": 7.632589150419235e-06, "loss": 0.4108, "step": 4690 }, { "epoch": 0.2373677432388071, "grad_norm": 3.2733674833799085, "learning_rate": 7.627538135165169e-06, "loss": 0.4247, "step": 4700 }, { "epoch": 0.23787278099037903, "grad_norm": 2.5012350340127583, "learning_rate": 7.622487119911103e-06, "loss": 0.4027, "step": 4710 }, { "epoch": 0.23837781874195096, "grad_norm": 2.918885002659697, "learning_rate": 7.6174361046570365e-06, "loss": 0.3961, "step": 4720 }, { "epoch": 0.2388828564935229, "grad_norm": 2.151448240091896, "learning_rate": 7.612385089402971e-06, "loss": 0.397, "step": 4730 }, { "epoch": 0.23938789424509482, "grad_norm": 3.645676075183417, "learning_rate": 7.607334074148905e-06, "loss": 0.4042, "step": 4740 }, { "epoch": 0.23989293199666675, "grad_norm": 4.7934119111079365, "learning_rate": 7.602283058894838e-06, "loss": 0.4199, "step": 4750 }, { "epoch": 0.24039796974823868, "grad_norm": 3.024756725193816, "learning_rate": 7.597232043640772e-06, "loss": 0.4214, "step": 4760 }, { "epoch": 0.2409030074998106, "grad_norm": 3.8298473971048552, "learning_rate": 7.592181028386706e-06, "loss": 0.3971, "step": 4770 }, { "epoch": 0.24140804525138254, "grad_norm": 3.6045455173768897, "learning_rate": 7.5871300131326395e-06, "loss": 0.41, "step": 4780 }, { "epoch": 0.24191308300295447, "grad_norm": 2.4793807629242686, "learning_rate": 7.5820789978785745e-06, "loss": 0.4027, "step": 4790 }, { "epoch": 0.2424181207545264, "grad_norm": 2.7349340937466837, "learning_rate": 7.577027982624509e-06, "loss": 0.4354, "step": 4800 }, { "epoch": 0.24292315850609833, "grad_norm": 3.2738776137303436, "learning_rate": 7.571976967370443e-06, "loss": 0.4472, "step": 4810 }, { "epoch": 0.24342819625767026, "grad_norm": 13.785461532124051, "learning_rate": 7.566925952116376e-06, "loss": 0.3969, "step": 4820 }, { "epoch": 0.2439332340092422, "grad_norm": 3.313726734189114, "learning_rate": 7.56187493686231e-06, "loss": 0.4007, "step": 4830 }, { "epoch": 0.24443827176081412, "grad_norm": 4.099815087006132, "learning_rate": 7.556823921608244e-06, "loss": 0.4099, "step": 4840 }, { "epoch": 0.24494330951238605, "grad_norm": 4.723031766050033, "learning_rate": 7.5517729063541775e-06, "loss": 0.4204, "step": 4850 }, { "epoch": 0.24544834726395798, "grad_norm": 3.7013488115087085, "learning_rate": 7.546721891100112e-06, "loss": 0.3928, "step": 4860 }, { "epoch": 0.2459533850155299, "grad_norm": 7.683672740752602, "learning_rate": 7.541670875846046e-06, "loss": 0.4028, "step": 4870 }, { "epoch": 0.24645842276710184, "grad_norm": 5.671010001370461, "learning_rate": 7.536619860591979e-06, "loss": 0.433, "step": 4880 }, { "epoch": 0.24696346051867377, "grad_norm": 9.869009780517741, "learning_rate": 7.531568845337913e-06, "loss": 0.3989, "step": 4890 }, { "epoch": 0.2474684982702457, "grad_norm": 5.709197913641078, "learning_rate": 7.526517830083847e-06, "loss": 0.4031, "step": 4900 }, { "epoch": 0.24797353602181763, "grad_norm": 4.33540159587482, "learning_rate": 7.521466814829781e-06, "loss": 0.4062, "step": 4910 }, { "epoch": 0.24847857377338955, "grad_norm": 8.109361669653662, "learning_rate": 7.5164157995757156e-06, "loss": 0.3972, "step": 4920 }, { "epoch": 0.24898361152496148, "grad_norm": 13.57377976771627, "learning_rate": 7.51136478432165e-06, "loss": 0.4067, "step": 4930 }, { "epoch": 0.24948864927653341, "grad_norm": 4.077593295992246, "learning_rate": 7.506313769067584e-06, "loss": 0.3992, "step": 4940 }, { "epoch": 0.24999368702810534, "grad_norm": 6.370437009913983, "learning_rate": 7.501262753813517e-06, "loss": 0.41, "step": 4950 }, { "epoch": 0.2504987247796773, "grad_norm": 4.917853551887297, "learning_rate": 7.496211738559451e-06, "loss": 0.3967, "step": 4960 }, { "epoch": 0.25100376253124923, "grad_norm": 8.018747776559712, "learning_rate": 7.491160723305385e-06, "loss": 0.4174, "step": 4970 }, { "epoch": 0.25150880028282113, "grad_norm": 5.56863245620068, "learning_rate": 7.486109708051319e-06, "loss": 0.4161, "step": 4980 }, { "epoch": 0.2520138380343931, "grad_norm": 3.5246503817814405, "learning_rate": 7.481058692797253e-06, "loss": 0.4161, "step": 4990 }, { "epoch": 0.252518875785965, "grad_norm": 8.805921139909751, "learning_rate": 7.476007677543187e-06, "loss": 0.4045, "step": 5000 }, { "epoch": 0.25302391353753695, "grad_norm": 4.998987522917178, "learning_rate": 7.47095666228912e-06, "loss": 0.3859, "step": 5010 }, { "epoch": 0.25352895128910885, "grad_norm": 4.00150580245438, "learning_rate": 7.465905647035054e-06, "loss": 0.4164, "step": 5020 }, { "epoch": 0.2540339890406808, "grad_norm": 3.706340372471242, "learning_rate": 7.460854631780988e-06, "loss": 0.4176, "step": 5030 }, { "epoch": 0.2545390267922527, "grad_norm": 4.998624600906844, "learning_rate": 7.4558036165269225e-06, "loss": 0.41, "step": 5040 }, { "epoch": 0.25504406454382467, "grad_norm": 8.31884957399422, "learning_rate": 7.450752601272856e-06, "loss": 0.3998, "step": 5050 }, { "epoch": 0.25554910229539657, "grad_norm": 3.4901102806237314, "learning_rate": 7.445701586018791e-06, "loss": 0.3997, "step": 5060 }, { "epoch": 0.2560541400469685, "grad_norm": 4.645395770689467, "learning_rate": 7.440650570764725e-06, "loss": 0.4083, "step": 5070 }, { "epoch": 0.2565591777985404, "grad_norm": 6.384417075428531, "learning_rate": 7.435599555510659e-06, "loss": 0.4092, "step": 5080 }, { "epoch": 0.2570642155501124, "grad_norm": 2.4454349649277694, "learning_rate": 7.430548540256592e-06, "loss": 0.3848, "step": 5090 }, { "epoch": 0.2575692533016843, "grad_norm": 3.558542444645485, "learning_rate": 7.425497525002526e-06, "loss": 0.4088, "step": 5100 }, { "epoch": 0.25807429105325624, "grad_norm": 3.0244806698758713, "learning_rate": 7.4204465097484605e-06, "loss": 0.4039, "step": 5110 }, { "epoch": 0.25857932880482815, "grad_norm": 3.8966266585093767, "learning_rate": 7.415395494494394e-06, "loss": 0.4235, "step": 5120 }, { "epoch": 0.2590843665564001, "grad_norm": 5.6318294513336555, "learning_rate": 7.410344479240328e-06, "loss": 0.4037, "step": 5130 }, { "epoch": 0.259589404307972, "grad_norm": 3.0998441234985714, "learning_rate": 7.405293463986262e-06, "loss": 0.3978, "step": 5140 }, { "epoch": 0.26009444205954396, "grad_norm": 2.21396747445095, "learning_rate": 7.400242448732195e-06, "loss": 0.4145, "step": 5150 }, { "epoch": 0.26059947981111586, "grad_norm": 4.1662815020101585, "learning_rate": 7.3951914334781294e-06, "loss": 0.3868, "step": 5160 }, { "epoch": 0.2611045175626878, "grad_norm": 12.395354056627705, "learning_rate": 7.3901404182240636e-06, "loss": 0.4154, "step": 5170 }, { "epoch": 0.2616095553142597, "grad_norm": 3.1142309464667712, "learning_rate": 7.385089402969997e-06, "loss": 0.413, "step": 5180 }, { "epoch": 0.2621145930658317, "grad_norm": 4.513387929199787, "learning_rate": 7.380038387715931e-06, "loss": 0.3921, "step": 5190 }, { "epoch": 0.2626196308174036, "grad_norm": 4.7697729405684735, "learning_rate": 7.374987372461866e-06, "loss": 0.4012, "step": 5200 }, { "epoch": 0.26312466856897554, "grad_norm": 4.893393941433008, "learning_rate": 7.3699363572078e-06, "loss": 0.4186, "step": 5210 }, { "epoch": 0.26362970632054744, "grad_norm": 3.831293276797052, "learning_rate": 7.364885341953733e-06, "loss": 0.4226, "step": 5220 }, { "epoch": 0.2641347440721194, "grad_norm": 9.39587893739541, "learning_rate": 7.3598343266996675e-06, "loss": 0.4223, "step": 5230 }, { "epoch": 0.2646397818236913, "grad_norm": 2.5508906516086913, "learning_rate": 7.354783311445602e-06, "loss": 0.4204, "step": 5240 }, { "epoch": 0.26514481957526326, "grad_norm": 1.9529316193965132, "learning_rate": 7.349732296191535e-06, "loss": 0.4137, "step": 5250 }, { "epoch": 0.26564985732683516, "grad_norm": 2.2836631555730147, "learning_rate": 7.344681280937469e-06, "loss": 0.4272, "step": 5260 }, { "epoch": 0.2661548950784071, "grad_norm": 2.740870228644112, "learning_rate": 7.339630265683403e-06, "loss": 0.4042, "step": 5270 }, { "epoch": 0.266659932829979, "grad_norm": 2.8335765045366057, "learning_rate": 7.334579250429336e-06, "loss": 0.4042, "step": 5280 }, { "epoch": 0.267164970581551, "grad_norm": 6.021567811733276, "learning_rate": 7.3295282351752705e-06, "loss": 0.4154, "step": 5290 }, { "epoch": 0.2676700083331229, "grad_norm": 3.8900529078388475, "learning_rate": 7.324477219921205e-06, "loss": 0.4066, "step": 5300 }, { "epoch": 0.26817504608469483, "grad_norm": 4.107751019837596, "learning_rate": 7.319426204667138e-06, "loss": 0.4314, "step": 5310 }, { "epoch": 0.26868008383626674, "grad_norm": 3.83489471259606, "learning_rate": 7.314375189413072e-06, "loss": 0.4289, "step": 5320 }, { "epoch": 0.2691851215878387, "grad_norm": 4.1927101461017955, "learning_rate": 7.309324174159007e-06, "loss": 0.4104, "step": 5330 }, { "epoch": 0.2696901593394106, "grad_norm": 4.40200271099351, "learning_rate": 7.304273158904941e-06, "loss": 0.4117, "step": 5340 }, { "epoch": 0.27019519709098255, "grad_norm": 2.5719446012019347, "learning_rate": 7.299222143650874e-06, "loss": 0.4172, "step": 5350 }, { "epoch": 0.27070023484255445, "grad_norm": 3.550169709846441, "learning_rate": 7.2941711283968085e-06, "loss": 0.3961, "step": 5360 }, { "epoch": 0.2712052725941264, "grad_norm": 5.9366850534425994, "learning_rate": 7.289120113142743e-06, "loss": 0.4092, "step": 5370 }, { "epoch": 0.2717103103456983, "grad_norm": 2.2429091940544756, "learning_rate": 7.284069097888676e-06, "loss": 0.3911, "step": 5380 }, { "epoch": 0.27221534809727027, "grad_norm": 3.417557725862848, "learning_rate": 7.27901808263461e-06, "loss": 0.404, "step": 5390 }, { "epoch": 0.27272038584884223, "grad_norm": 2.724964100715556, "learning_rate": 7.273967067380544e-06, "loss": 0.3954, "step": 5400 }, { "epoch": 0.27322542360041413, "grad_norm": 2.856161247602667, "learning_rate": 7.268916052126478e-06, "loss": 0.4166, "step": 5410 }, { "epoch": 0.2737304613519861, "grad_norm": 3.6154589918708724, "learning_rate": 7.2638650368724116e-06, "loss": 0.4194, "step": 5420 }, { "epoch": 0.274235499103558, "grad_norm": 4.796628055616583, "learning_rate": 7.258814021618346e-06, "loss": 0.4047, "step": 5430 }, { "epoch": 0.27474053685512995, "grad_norm": 1.8189441498839218, "learning_rate": 7.25376300636428e-06, "loss": 0.427, "step": 5440 }, { "epoch": 0.27524557460670185, "grad_norm": 7.024511830423675, "learning_rate": 7.248711991110213e-06, "loss": 0.4046, "step": 5450 }, { "epoch": 0.2757506123582738, "grad_norm": 3.5667517176775423, "learning_rate": 7.243660975856147e-06, "loss": 0.4104, "step": 5460 }, { "epoch": 0.2762556501098457, "grad_norm": 2.9842087732377456, "learning_rate": 7.238609960602082e-06, "loss": 0.3865, "step": 5470 }, { "epoch": 0.27676068786141766, "grad_norm": 3.138211814542453, "learning_rate": 7.233558945348016e-06, "loss": 0.421, "step": 5480 }, { "epoch": 0.27726572561298957, "grad_norm": 3.7899763794613603, "learning_rate": 7.22850793009395e-06, "loss": 0.4211, "step": 5490 }, { "epoch": 0.2777707633645615, "grad_norm": 6.302295488602417, "learning_rate": 7.223456914839884e-06, "loss": 0.414, "step": 5500 }, { "epoch": 0.2782758011161334, "grad_norm": 10.35490362594059, "learning_rate": 7.218405899585818e-06, "loss": 0.416, "step": 5510 }, { "epoch": 0.2787808388677054, "grad_norm": 6.310469266434071, "learning_rate": 7.213354884331751e-06, "loss": 0.4064, "step": 5520 }, { "epoch": 0.2792858766192773, "grad_norm": 16.515977602019493, "learning_rate": 7.208303869077685e-06, "loss": 0.4079, "step": 5530 }, { "epoch": 0.27979091437084924, "grad_norm": 4.823535989192853, "learning_rate": 7.203252853823619e-06, "loss": 0.4057, "step": 5540 }, { "epoch": 0.28029595212242114, "grad_norm": 5.510784263870791, "learning_rate": 7.198201838569553e-06, "loss": 0.4037, "step": 5550 }, { "epoch": 0.2808009898739931, "grad_norm": 6.358367429720183, "learning_rate": 7.193150823315487e-06, "loss": 0.4229, "step": 5560 }, { "epoch": 0.281306027625565, "grad_norm": 6.411947776097414, "learning_rate": 7.188099808061421e-06, "loss": 0.4067, "step": 5570 }, { "epoch": 0.28181106537713696, "grad_norm": 7.711569114064788, "learning_rate": 7.183048792807354e-06, "loss": 0.4033, "step": 5580 }, { "epoch": 0.28231610312870886, "grad_norm": 7.577084197751842, "learning_rate": 7.177997777553288e-06, "loss": 0.4193, "step": 5590 }, { "epoch": 0.2828211408802808, "grad_norm": 8.359668853331197, "learning_rate": 7.172946762299223e-06, "loss": 0.4075, "step": 5600 }, { "epoch": 0.2833261786318527, "grad_norm": 3.151405334238333, "learning_rate": 7.167895747045157e-06, "loss": 0.3968, "step": 5610 }, { "epoch": 0.2838312163834247, "grad_norm": 5.148673338465559, "learning_rate": 7.162844731791091e-06, "loss": 0.4084, "step": 5620 }, { "epoch": 0.2843362541349966, "grad_norm": 4.221476129767122, "learning_rate": 7.157793716537025e-06, "loss": 0.4171, "step": 5630 }, { "epoch": 0.28484129188656854, "grad_norm": 13.100880512181337, "learning_rate": 7.152742701282959e-06, "loss": 0.4153, "step": 5640 }, { "epoch": 0.28534632963814044, "grad_norm": 4.5479841682511735, "learning_rate": 7.147691686028892e-06, "loss": 0.4008, "step": 5650 }, { "epoch": 0.2858513673897124, "grad_norm": 12.774978173115935, "learning_rate": 7.142640670774826e-06, "loss": 0.4081, "step": 5660 }, { "epoch": 0.2863564051412843, "grad_norm": 4.74811206992399, "learning_rate": 7.1375896555207604e-06, "loss": 0.4111, "step": 5670 }, { "epoch": 0.28686144289285626, "grad_norm": 6.322992937888857, "learning_rate": 7.132538640266694e-06, "loss": 0.4147, "step": 5680 }, { "epoch": 0.28736648064442816, "grad_norm": 2.3879730307069913, "learning_rate": 7.127487625012628e-06, "loss": 0.413, "step": 5690 }, { "epoch": 0.2878715183960001, "grad_norm": 9.78190300595279, "learning_rate": 7.122436609758562e-06, "loss": 0.4082, "step": 5700 }, { "epoch": 0.288376556147572, "grad_norm": 2.9229120102524546, "learning_rate": 7.117385594504495e-06, "loss": 0.3991, "step": 5710 }, { "epoch": 0.288881593899144, "grad_norm": 3.0629100552949367, "learning_rate": 7.112334579250429e-06, "loss": 0.4219, "step": 5720 }, { "epoch": 0.2893866316507159, "grad_norm": 4.461418623396228, "learning_rate": 7.1072835639963635e-06, "loss": 0.3935, "step": 5730 }, { "epoch": 0.28989166940228783, "grad_norm": 4.158346788895872, "learning_rate": 7.1022325487422985e-06, "loss": 0.4055, "step": 5740 }, { "epoch": 0.29039670715385973, "grad_norm": 4.423297022005039, "learning_rate": 7.097181533488232e-06, "loss": 0.4056, "step": 5750 }, { "epoch": 0.2909017449054317, "grad_norm": 9.748691220694795, "learning_rate": 7.092130518234166e-06, "loss": 0.4163, "step": 5760 }, { "epoch": 0.2914067826570036, "grad_norm": 5.583094724720717, "learning_rate": 7.0870795029801e-06, "loss": 0.404, "step": 5770 }, { "epoch": 0.29191182040857555, "grad_norm": 5.4361132596130375, "learning_rate": 7.082028487726034e-06, "loss": 0.4077, "step": 5780 }, { "epoch": 0.29241685816014745, "grad_norm": 5.4795736369075225, "learning_rate": 7.076977472471967e-06, "loss": 0.385, "step": 5790 }, { "epoch": 0.2929218959117194, "grad_norm": 6.295238304978109, "learning_rate": 7.0719264572179015e-06, "loss": 0.3865, "step": 5800 }, { "epoch": 0.2934269336632913, "grad_norm": 6.992767843962523, "learning_rate": 7.066875441963836e-06, "loss": 0.4016, "step": 5810 }, { "epoch": 0.29393197141486327, "grad_norm": 2.2081931951077762, "learning_rate": 7.061824426709769e-06, "loss": 0.4177, "step": 5820 }, { "epoch": 0.29443700916643517, "grad_norm": 2.6034285969774755, "learning_rate": 7.056773411455703e-06, "loss": 0.4259, "step": 5830 }, { "epoch": 0.29494204691800713, "grad_norm": 3.409449485576021, "learning_rate": 7.051722396201637e-06, "loss": 0.4033, "step": 5840 }, { "epoch": 0.29544708466957903, "grad_norm": 4.0667424482713415, "learning_rate": 7.04667138094757e-06, "loss": 0.3996, "step": 5850 }, { "epoch": 0.295952122421151, "grad_norm": 10.124949019281052, "learning_rate": 7.0416203656935045e-06, "loss": 0.4006, "step": 5860 }, { "epoch": 0.2964571601727229, "grad_norm": 14.286058521059143, "learning_rate": 7.0365693504394395e-06, "loss": 0.4095, "step": 5870 }, { "epoch": 0.29696219792429485, "grad_norm": 6.367565010187276, "learning_rate": 7.031518335185374e-06, "loss": 0.3838, "step": 5880 }, { "epoch": 0.29746723567586675, "grad_norm": 5.583399625565326, "learning_rate": 7.026467319931307e-06, "loss": 0.4117, "step": 5890 }, { "epoch": 0.2979722734274387, "grad_norm": 3.0731137084379534, "learning_rate": 7.021416304677241e-06, "loss": 0.4236, "step": 5900 }, { "epoch": 0.2984773111790106, "grad_norm": 3.0663411110006646, "learning_rate": 7.016365289423175e-06, "loss": 0.4053, "step": 5910 }, { "epoch": 0.29898234893058256, "grad_norm": 2.750574161212685, "learning_rate": 7.0113142741691084e-06, "loss": 0.4068, "step": 5920 }, { "epoch": 0.29948738668215447, "grad_norm": 6.009688968198948, "learning_rate": 7.0062632589150426e-06, "loss": 0.3836, "step": 5930 }, { "epoch": 0.2999924244337264, "grad_norm": 2.9753632251149047, "learning_rate": 7.001212243660977e-06, "loss": 0.4004, "step": 5940 }, { "epoch": 0.3004974621852983, "grad_norm": 4.208610201729938, "learning_rate": 6.99616122840691e-06, "loss": 0.4076, "step": 5950 }, { "epoch": 0.3010024999368703, "grad_norm": 4.7205159244344435, "learning_rate": 6.991110213152844e-06, "loss": 0.4169, "step": 5960 }, { "epoch": 0.3015075376884422, "grad_norm": 5.1888963188216914, "learning_rate": 6.986059197898778e-06, "loss": 0.4131, "step": 5970 }, { "epoch": 0.30201257544001414, "grad_norm": 5.94229402465264, "learning_rate": 6.9810081826447115e-06, "loss": 0.3973, "step": 5980 }, { "epoch": 0.30251761319158604, "grad_norm": 5.525705257458754, "learning_rate": 6.975957167390646e-06, "loss": 0.4055, "step": 5990 }, { "epoch": 0.303022650943158, "grad_norm": 12.459125129727365, "learning_rate": 6.97090615213658e-06, "loss": 0.3971, "step": 6000 }, { "epoch": 0.30352768869472996, "grad_norm": 31.656872346789477, "learning_rate": 6.965855136882515e-06, "loss": 0.407, "step": 6010 }, { "epoch": 0.30403272644630186, "grad_norm": 9.639606695045442, "learning_rate": 6.960804121628448e-06, "loss": 0.3863, "step": 6020 }, { "epoch": 0.3045377641978738, "grad_norm": 7.035557345160716, "learning_rate": 6.955753106374382e-06, "loss": 0.3869, "step": 6030 }, { "epoch": 0.3050428019494457, "grad_norm": 4.9371125965383165, "learning_rate": 6.950702091120316e-06, "loss": 0.3976, "step": 6040 }, { "epoch": 0.3055478397010177, "grad_norm": 3.933897990218482, "learning_rate": 6.9456510758662495e-06, "loss": 0.3897, "step": 6050 }, { "epoch": 0.3060528774525896, "grad_norm": 3.1641621883370337, "learning_rate": 6.940600060612184e-06, "loss": 0.3907, "step": 6060 }, { "epoch": 0.30655791520416154, "grad_norm": 8.063219971247296, "learning_rate": 6.935549045358118e-06, "loss": 0.4031, "step": 6070 }, { "epoch": 0.30706295295573344, "grad_norm": 5.068712111699617, "learning_rate": 6.930498030104051e-06, "loss": 0.3669, "step": 6080 }, { "epoch": 0.3075679907073054, "grad_norm": 2.659761406451919, "learning_rate": 6.925447014849985e-06, "loss": 0.4083, "step": 6090 }, { "epoch": 0.3080730284588773, "grad_norm": 2.711900853734696, "learning_rate": 6.920395999595919e-06, "loss": 0.3857, "step": 6100 }, { "epoch": 0.30857806621044925, "grad_norm": 3.5691023416952015, "learning_rate": 6.9153449843418526e-06, "loss": 0.4075, "step": 6110 }, { "epoch": 0.30908310396202116, "grad_norm": 9.44528498007685, "learning_rate": 6.910293969087787e-06, "loss": 0.3907, "step": 6120 }, { "epoch": 0.3095881417135931, "grad_norm": 6.630952946299743, "learning_rate": 6.905242953833721e-06, "loss": 0.4166, "step": 6130 }, { "epoch": 0.310093179465165, "grad_norm": 2.240750009167658, "learning_rate": 6.900191938579655e-06, "loss": 0.4103, "step": 6140 }, { "epoch": 0.31059821721673697, "grad_norm": 3.4596094945674865, "learning_rate": 6.89514092332559e-06, "loss": 0.3979, "step": 6150 }, { "epoch": 0.3111032549683089, "grad_norm": 3.4198773372663984, "learning_rate": 6.890089908071523e-06, "loss": 0.4131, "step": 6160 }, { "epoch": 0.31160829271988083, "grad_norm": 2.6133323586176163, "learning_rate": 6.885038892817457e-06, "loss": 0.3966, "step": 6170 }, { "epoch": 0.31211333047145273, "grad_norm": 2.7898977693699667, "learning_rate": 6.879987877563391e-06, "loss": 0.4005, "step": 6180 }, { "epoch": 0.3126183682230247, "grad_norm": 15.014756094180136, "learning_rate": 6.874936862309325e-06, "loss": 0.417, "step": 6190 }, { "epoch": 0.3131234059745966, "grad_norm": 3.877839859697261, "learning_rate": 6.869885847055259e-06, "loss": 0.3967, "step": 6200 }, { "epoch": 0.31362844372616855, "grad_norm": 3.642380487638742, "learning_rate": 6.864834831801193e-06, "loss": 0.3872, "step": 6210 }, { "epoch": 0.31413348147774045, "grad_norm": 6.1392539662766055, "learning_rate": 6.859783816547126e-06, "loss": 0.3969, "step": 6220 }, { "epoch": 0.3146385192293124, "grad_norm": 2.8146690736323206, "learning_rate": 6.85473280129306e-06, "loss": 0.4129, "step": 6230 }, { "epoch": 0.3151435569808843, "grad_norm": 4.058997537956916, "learning_rate": 6.8496817860389945e-06, "loss": 0.4015, "step": 6240 }, { "epoch": 0.31564859473245627, "grad_norm": 5.742551004803781, "learning_rate": 6.844630770784928e-06, "loss": 0.3953, "step": 6250 }, { "epoch": 0.31615363248402817, "grad_norm": 15.598787015706185, "learning_rate": 6.839579755530862e-06, "loss": 0.4031, "step": 6260 }, { "epoch": 0.3166586702356001, "grad_norm": 5.020417282923178, "learning_rate": 6.834528740276796e-06, "loss": 0.4035, "step": 6270 }, { "epoch": 0.317163707987172, "grad_norm": 4.667007823216816, "learning_rate": 6.829477725022731e-06, "loss": 0.406, "step": 6280 }, { "epoch": 0.317668745738744, "grad_norm": 7.463699952057543, "learning_rate": 6.824426709768664e-06, "loss": 0.4108, "step": 6290 }, { "epoch": 0.3181737834903159, "grad_norm": 4.262657782964267, "learning_rate": 6.819375694514598e-06, "loss": 0.3974, "step": 6300 }, { "epoch": 0.31867882124188784, "grad_norm": 3.3264387869397862, "learning_rate": 6.8143246792605325e-06, "loss": 0.3952, "step": 6310 }, { "epoch": 0.31918385899345975, "grad_norm": 12.90071142768197, "learning_rate": 6.809273664006466e-06, "loss": 0.3821, "step": 6320 }, { "epoch": 0.3196888967450317, "grad_norm": 6.103897285160824, "learning_rate": 6.8042226487524e-06, "loss": 0.387, "step": 6330 }, { "epoch": 0.3201939344966036, "grad_norm": 5.664255551154975, "learning_rate": 6.799171633498334e-06, "loss": 0.3939, "step": 6340 }, { "epoch": 0.32069897224817556, "grad_norm": 3.9983006490372746, "learning_rate": 6.794120618244267e-06, "loss": 0.4026, "step": 6350 }, { "epoch": 0.32120400999974746, "grad_norm": 4.502616368454671, "learning_rate": 6.789069602990201e-06, "loss": 0.3992, "step": 6360 }, { "epoch": 0.3217090477513194, "grad_norm": 6.834315364764452, "learning_rate": 6.7840185877361355e-06, "loss": 0.4131, "step": 6370 }, { "epoch": 0.3222140855028913, "grad_norm": 6.26718953681543, "learning_rate": 6.778967572482069e-06, "loss": 0.4066, "step": 6380 }, { "epoch": 0.3227191232544633, "grad_norm": 7.341432352989431, "learning_rate": 6.773916557228003e-06, "loss": 0.3883, "step": 6390 }, { "epoch": 0.3232241610060352, "grad_norm": 11.668272167662552, "learning_rate": 6.768865541973937e-06, "loss": 0.415, "step": 6400 }, { "epoch": 0.32372919875760714, "grad_norm": 18.632730593364624, "learning_rate": 6.76381452671987e-06, "loss": 0.4084, "step": 6410 }, { "epoch": 0.32423423650917904, "grad_norm": 9.46350511552533, "learning_rate": 6.758763511465805e-06, "loss": 0.3807, "step": 6420 }, { "epoch": 0.324739274260751, "grad_norm": 4.410820137502103, "learning_rate": 6.7537124962117394e-06, "loss": 0.3844, "step": 6430 }, { "epoch": 0.3252443120123229, "grad_norm": 9.07873183264923, "learning_rate": 6.7486614809576736e-06, "loss": 0.3902, "step": 6440 }, { "epoch": 0.32574934976389486, "grad_norm": 17.65425817976251, "learning_rate": 6.743610465703607e-06, "loss": 0.3902, "step": 6450 }, { "epoch": 0.32625438751546676, "grad_norm": 4.109418188768749, "learning_rate": 6.738559450449541e-06, "loss": 0.391, "step": 6460 }, { "epoch": 0.3267594252670387, "grad_norm": 4.071871347874285, "learning_rate": 6.733508435195475e-06, "loss": 0.4067, "step": 6470 }, { "epoch": 0.3272644630186106, "grad_norm": 4.833923669112235, "learning_rate": 6.728457419941408e-06, "loss": 0.3869, "step": 6480 }, { "epoch": 0.3277695007701826, "grad_norm": 5.226542205736505, "learning_rate": 6.7234064046873425e-06, "loss": 0.3983, "step": 6490 }, { "epoch": 0.3282745385217545, "grad_norm": 14.955066893117705, "learning_rate": 6.718355389433277e-06, "loss": 0.3892, "step": 6500 }, { "epoch": 0.32877957627332643, "grad_norm": 9.826109172231558, "learning_rate": 6.713304374179211e-06, "loss": 0.3849, "step": 6510 }, { "epoch": 0.32928461402489834, "grad_norm": 10.365177617347678, "learning_rate": 6.708253358925144e-06, "loss": 0.3909, "step": 6520 }, { "epoch": 0.3297896517764703, "grad_norm": 3.902511753305759, "learning_rate": 6.703202343671078e-06, "loss": 0.3894, "step": 6530 }, { "epoch": 0.3302946895280422, "grad_norm": 2.5389813606137, "learning_rate": 6.698151328417012e-06, "loss": 0.3947, "step": 6540 }, { "epoch": 0.33079972727961415, "grad_norm": 3.076985557483421, "learning_rate": 6.693100313162947e-06, "loss": 0.385, "step": 6550 }, { "epoch": 0.33130476503118605, "grad_norm": 3.9649750433361164, "learning_rate": 6.6880492979088805e-06, "loss": 0.3904, "step": 6560 }, { "epoch": 0.331809802782758, "grad_norm": 11.994320837772984, "learning_rate": 6.682998282654815e-06, "loss": 0.3983, "step": 6570 }, { "epoch": 0.3323148405343299, "grad_norm": 7.214021116152274, "learning_rate": 6.677947267400749e-06, "loss": 0.3978, "step": 6580 }, { "epoch": 0.33281987828590187, "grad_norm": 2.6026929269364727, "learning_rate": 6.672896252146682e-06, "loss": 0.3775, "step": 6590 }, { "epoch": 0.33332491603747383, "grad_norm": 13.22492686543516, "learning_rate": 6.667845236892616e-06, "loss": 0.3872, "step": 6600 }, { "epoch": 0.33382995378904573, "grad_norm": 3.4887621572574443, "learning_rate": 6.66279422163855e-06, "loss": 0.3961, "step": 6610 }, { "epoch": 0.3343349915406177, "grad_norm": 4.75022907961459, "learning_rate": 6.6577432063844835e-06, "loss": 0.4064, "step": 6620 }, { "epoch": 0.3348400292921896, "grad_norm": 3.0134556897490876, "learning_rate": 6.652692191130418e-06, "loss": 0.4082, "step": 6630 }, { "epoch": 0.33534506704376155, "grad_norm": 6.134839516437559, "learning_rate": 6.647641175876352e-06, "loss": 0.3807, "step": 6640 }, { "epoch": 0.33585010479533345, "grad_norm": 2.967962385112729, "learning_rate": 6.642590160622285e-06, "loss": 0.389, "step": 6650 }, { "epoch": 0.3363551425469054, "grad_norm": 2.530280284171402, "learning_rate": 6.637539145368219e-06, "loss": 0.3966, "step": 6660 }, { "epoch": 0.3368601802984773, "grad_norm": 5.8276867252853455, "learning_rate": 6.632488130114153e-06, "loss": 0.3899, "step": 6670 }, { "epoch": 0.33736521805004926, "grad_norm": 11.32729799857443, "learning_rate": 6.627437114860087e-06, "loss": 0.3977, "step": 6680 }, { "epoch": 0.33787025580162117, "grad_norm": 4.636057747530336, "learning_rate": 6.6223860996060216e-06, "loss": 0.3879, "step": 6690 }, { "epoch": 0.3383752935531931, "grad_norm": 3.644150608144144, "learning_rate": 6.617335084351956e-06, "loss": 0.4062, "step": 6700 }, { "epoch": 0.338880331304765, "grad_norm": 9.355537955116619, "learning_rate": 6.61228406909789e-06, "loss": 0.3866, "step": 6710 }, { "epoch": 0.339385369056337, "grad_norm": 2.4262554363736517, "learning_rate": 6.607233053843823e-06, "loss": 0.4244, "step": 6720 }, { "epoch": 0.3398904068079089, "grad_norm": 4.581445110742895, "learning_rate": 6.602182038589757e-06, "loss": 0.3843, "step": 6730 }, { "epoch": 0.34039544455948084, "grad_norm": 2.4362996437893902, "learning_rate": 6.597131023335691e-06, "loss": 0.3948, "step": 6740 }, { "epoch": 0.34090048231105274, "grad_norm": 4.360969619561423, "learning_rate": 6.592080008081625e-06, "loss": 0.3861, "step": 6750 }, { "epoch": 0.3414055200626247, "grad_norm": 3.2345867404100423, "learning_rate": 6.587028992827559e-06, "loss": 0.422, "step": 6760 }, { "epoch": 0.3419105578141966, "grad_norm": 5.310935386991515, "learning_rate": 6.581977977573493e-06, "loss": 0.4137, "step": 6770 }, { "epoch": 0.34241559556576856, "grad_norm": 10.95834322260054, "learning_rate": 6.576926962319426e-06, "loss": 0.4002, "step": 6780 }, { "epoch": 0.34292063331734046, "grad_norm": 7.178923820806059, "learning_rate": 6.57187594706536e-06, "loss": 0.3771, "step": 6790 }, { "epoch": 0.3434256710689124, "grad_norm": 16.529256071560802, "learning_rate": 6.566824931811294e-06, "loss": 0.4129, "step": 6800 }, { "epoch": 0.3439307088204843, "grad_norm": 3.673940050614789, "learning_rate": 6.561773916557228e-06, "loss": 0.4034, "step": 6810 }, { "epoch": 0.3444357465720563, "grad_norm": 2.9280965882270404, "learning_rate": 6.556722901303163e-06, "loss": 0.3892, "step": 6820 }, { "epoch": 0.3449407843236282, "grad_norm": 93.05414347131021, "learning_rate": 6.551671886049097e-06, "loss": 0.4061, "step": 6830 }, { "epoch": 0.34544582207520014, "grad_norm": 4.709615772022943, "learning_rate": 6.546620870795031e-06, "loss": 0.3948, "step": 6840 }, { "epoch": 0.34595085982677204, "grad_norm": 4.852654627656968, "learning_rate": 6.541569855540965e-06, "loss": 0.3938, "step": 6850 }, { "epoch": 0.346455897578344, "grad_norm": 4.816390805815548, "learning_rate": 6.536518840286898e-06, "loss": 0.3792, "step": 6860 }, { "epoch": 0.3469609353299159, "grad_norm": 3.6013276651698374, "learning_rate": 6.531467825032832e-06, "loss": 0.4012, "step": 6870 }, { "epoch": 0.34746597308148786, "grad_norm": 3.4921009820598576, "learning_rate": 6.5264168097787665e-06, "loss": 0.4083, "step": 6880 }, { "epoch": 0.34797101083305976, "grad_norm": 6.61814343879125, "learning_rate": 6.5213657945247e-06, "loss": 0.4049, "step": 6890 }, { "epoch": 0.3484760485846317, "grad_norm": 4.052623448201924, "learning_rate": 6.516314779270634e-06, "loss": 0.388, "step": 6900 }, { "epoch": 0.3489810863362036, "grad_norm": 3.63955593340056, "learning_rate": 6.511263764016568e-06, "loss": 0.4061, "step": 6910 }, { "epoch": 0.3494861240877756, "grad_norm": 4.150263872578294, "learning_rate": 6.506212748762501e-06, "loss": 0.4054, "step": 6920 }, { "epoch": 0.3499911618393475, "grad_norm": 7.266970150410544, "learning_rate": 6.5011617335084354e-06, "loss": 0.3793, "step": 6930 }, { "epoch": 0.35049619959091943, "grad_norm": 7.712101909778269, "learning_rate": 6.4961107182543696e-06, "loss": 0.3994, "step": 6940 }, { "epoch": 0.35100123734249133, "grad_norm": 5.509148276281905, "learning_rate": 6.491059703000303e-06, "loss": 0.3781, "step": 6950 }, { "epoch": 0.3515062750940633, "grad_norm": 2.589095673859192, "learning_rate": 6.486008687746238e-06, "loss": 0.3978, "step": 6960 }, { "epoch": 0.3520113128456352, "grad_norm": 4.122842581672093, "learning_rate": 6.480957672492172e-06, "loss": 0.4009, "step": 6970 }, { "epoch": 0.35251635059720715, "grad_norm": 4.1842800217172265, "learning_rate": 6.475906657238106e-06, "loss": 0.4029, "step": 6980 }, { "epoch": 0.35302138834877905, "grad_norm": 7.398626485810365, "learning_rate": 6.470855641984039e-06, "loss": 0.4078, "step": 6990 }, { "epoch": 0.353526426100351, "grad_norm": 5.396774144943605, "learning_rate": 6.4658046267299735e-06, "loss": 0.4059, "step": 7000 }, { "epoch": 0.3540314638519229, "grad_norm": 13.281878929001232, "learning_rate": 6.460753611475908e-06, "loss": 0.4024, "step": 7010 }, { "epoch": 0.35453650160349487, "grad_norm": 4.283305275547664, "learning_rate": 6.455702596221841e-06, "loss": 0.4108, "step": 7020 }, { "epoch": 0.35504153935506677, "grad_norm": 3.8392385244226794, "learning_rate": 6.450651580967775e-06, "loss": 0.4227, "step": 7030 }, { "epoch": 0.35554657710663873, "grad_norm": 5.012629063346216, "learning_rate": 6.445600565713709e-06, "loss": 0.4071, "step": 7040 }, { "epoch": 0.35605161485821063, "grad_norm": 13.322782620701876, "learning_rate": 6.440549550459642e-06, "loss": 0.3926, "step": 7050 }, { "epoch": 0.3565566526097826, "grad_norm": 5.149647425656023, "learning_rate": 6.4354985352055765e-06, "loss": 0.4017, "step": 7060 }, { "epoch": 0.3570616903613545, "grad_norm": 7.5875779832186065, "learning_rate": 6.430447519951511e-06, "loss": 0.3867, "step": 7070 }, { "epoch": 0.35756672811292645, "grad_norm": 3.433655269776608, "learning_rate": 6.425396504697444e-06, "loss": 0.3788, "step": 7080 }, { "epoch": 0.35807176586449835, "grad_norm": 4.332715392495331, "learning_rate": 6.420345489443378e-06, "loss": 0.3672, "step": 7090 }, { "epoch": 0.3585768036160703, "grad_norm": 8.825136973366781, "learning_rate": 6.415294474189313e-06, "loss": 0.391, "step": 7100 }, { "epoch": 0.3590818413676422, "grad_norm": 4.896756858317604, "learning_rate": 6.410243458935247e-06, "loss": 0.3981, "step": 7110 }, { "epoch": 0.35958687911921416, "grad_norm": 3.0181643910861595, "learning_rate": 6.40519244368118e-06, "loss": 0.3998, "step": 7120 }, { "epoch": 0.36009191687078607, "grad_norm": 2.5429084560847346, "learning_rate": 6.4001414284271145e-06, "loss": 0.3973, "step": 7130 }, { "epoch": 0.360596954622358, "grad_norm": 4.518411589704886, "learning_rate": 6.395090413173049e-06, "loss": 0.3921, "step": 7140 }, { "epoch": 0.3611019923739299, "grad_norm": 2.520647494438761, "learning_rate": 6.390039397918982e-06, "loss": 0.3838, "step": 7150 }, { "epoch": 0.3616070301255019, "grad_norm": 5.45309379468731, "learning_rate": 6.384988382664916e-06, "loss": 0.3966, "step": 7160 }, { "epoch": 0.3621120678770738, "grad_norm": 5.822220380763848, "learning_rate": 6.37993736741085e-06, "loss": 0.4041, "step": 7170 }, { "epoch": 0.36261710562864574, "grad_norm": 4.920376454170573, "learning_rate": 6.3748863521567835e-06, "loss": 0.3994, "step": 7180 }, { "epoch": 0.3631221433802177, "grad_norm": 2.5736259947121707, "learning_rate": 6.369835336902718e-06, "loss": 0.3944, "step": 7190 }, { "epoch": 0.3636271811317896, "grad_norm": 2.749872482951078, "learning_rate": 6.364784321648652e-06, "loss": 0.3985, "step": 7200 }, { "epoch": 0.36413221888336156, "grad_norm": 2.5194888325972267, "learning_rate": 6.359733306394586e-06, "loss": 0.3939, "step": 7210 }, { "epoch": 0.36463725663493346, "grad_norm": 3.574666468822497, "learning_rate": 6.354682291140519e-06, "loss": 0.4234, "step": 7220 }, { "epoch": 0.3651422943865054, "grad_norm": 3.0608012516572325, "learning_rate": 6.349631275886454e-06, "loss": 0.3894, "step": 7230 }, { "epoch": 0.3656473321380773, "grad_norm": 2.888576087403178, "learning_rate": 6.344580260632388e-06, "loss": 0.4149, "step": 7240 }, { "epoch": 0.3661523698896493, "grad_norm": 2.6426553837219378, "learning_rate": 6.339529245378322e-06, "loss": 0.3788, "step": 7250 }, { "epoch": 0.3666574076412212, "grad_norm": 8.704787479012706, "learning_rate": 6.334478230124256e-06, "loss": 0.4047, "step": 7260 }, { "epoch": 0.36716244539279314, "grad_norm": 2.497691163861801, "learning_rate": 6.32942721487019e-06, "loss": 0.3959, "step": 7270 }, { "epoch": 0.36766748314436504, "grad_norm": 2.8835395006733306, "learning_rate": 6.324376199616124e-06, "loss": 0.3964, "step": 7280 }, { "epoch": 0.368172520895937, "grad_norm": 4.011809451507155, "learning_rate": 6.319325184362057e-06, "loss": 0.4196, "step": 7290 }, { "epoch": 0.3686775586475089, "grad_norm": 9.138568921825629, "learning_rate": 6.314274169107991e-06, "loss": 0.4075, "step": 7300 }, { "epoch": 0.36918259639908085, "grad_norm": 2.6137236844527676, "learning_rate": 6.309223153853925e-06, "loss": 0.3967, "step": 7310 }, { "epoch": 0.36968763415065276, "grad_norm": 5.521296938399183, "learning_rate": 6.304172138599859e-06, "loss": 0.4005, "step": 7320 }, { "epoch": 0.3701926719022247, "grad_norm": 7.608318289570054, "learning_rate": 6.299121123345793e-06, "loss": 0.4103, "step": 7330 }, { "epoch": 0.3706977096537966, "grad_norm": 3.952351490468356, "learning_rate": 6.294070108091727e-06, "loss": 0.3955, "step": 7340 }, { "epoch": 0.37120274740536857, "grad_norm": 3.2275756395096367, "learning_rate": 6.28901909283766e-06, "loss": 0.3691, "step": 7350 }, { "epoch": 0.3717077851569405, "grad_norm": 5.1929124366289185, "learning_rate": 6.283968077583594e-06, "loss": 0.4109, "step": 7360 }, { "epoch": 0.37221282290851243, "grad_norm": 3.7102445761239147, "learning_rate": 6.278917062329529e-06, "loss": 0.3855, "step": 7370 }, { "epoch": 0.37271786066008433, "grad_norm": 4.102719494934126, "learning_rate": 6.273866047075463e-06, "loss": 0.4085, "step": 7380 }, { "epoch": 0.3732228984116563, "grad_norm": 2.882775662800612, "learning_rate": 6.268815031821397e-06, "loss": 0.3949, "step": 7390 }, { "epoch": 0.3737279361632282, "grad_norm": 3.847808178042115, "learning_rate": 6.263764016567331e-06, "loss": 0.3893, "step": 7400 }, { "epoch": 0.37423297391480015, "grad_norm": 11.984324118193372, "learning_rate": 6.258713001313265e-06, "loss": 0.376, "step": 7410 }, { "epoch": 0.37473801166637205, "grad_norm": 3.3001953512413897, "learning_rate": 6.253661986059198e-06, "loss": 0.3823, "step": 7420 }, { "epoch": 0.375243049417944, "grad_norm": 5.373513774406177, "learning_rate": 6.248610970805132e-06, "loss": 0.4099, "step": 7430 }, { "epoch": 0.3757480871695159, "grad_norm": 2.1585330518152492, "learning_rate": 6.2435599555510664e-06, "loss": 0.3977, "step": 7440 }, { "epoch": 0.37625312492108787, "grad_norm": 5.400683846446516, "learning_rate": 6.238508940297e-06, "loss": 0.3917, "step": 7450 }, { "epoch": 0.37675816267265977, "grad_norm": 11.58546846012599, "learning_rate": 6.233457925042934e-06, "loss": 0.4104, "step": 7460 }, { "epoch": 0.3772632004242317, "grad_norm": 6.158451538704526, "learning_rate": 6.228406909788868e-06, "loss": 0.3817, "step": 7470 }, { "epoch": 0.37776823817580363, "grad_norm": 3.220415459261749, "learning_rate": 6.223355894534801e-06, "loss": 0.4143, "step": 7480 }, { "epoch": 0.3782732759273756, "grad_norm": 6.483927128344251, "learning_rate": 6.218304879280735e-06, "loss": 0.4009, "step": 7490 }, { "epoch": 0.3787783136789475, "grad_norm": 3.1374527379369628, "learning_rate": 6.21325386402667e-06, "loss": 0.3825, "step": 7500 }, { "epoch": 0.37928335143051944, "grad_norm": 1.9293391844723216, "learning_rate": 6.2082028487726045e-06, "loss": 0.3986, "step": 7510 }, { "epoch": 0.37978838918209135, "grad_norm": 2.8612302362457704, "learning_rate": 6.203151833518538e-06, "loss": 0.4093, "step": 7520 }, { "epoch": 0.3802934269336633, "grad_norm": 27.52073154576333, "learning_rate": 6.198100818264472e-06, "loss": 0.41, "step": 7530 }, { "epoch": 0.3807984646852352, "grad_norm": 18.495429795467157, "learning_rate": 6.193049803010406e-06, "loss": 0.3918, "step": 7540 }, { "epoch": 0.38130350243680716, "grad_norm": 1.9149364107527063, "learning_rate": 6.187998787756339e-06, "loss": 0.4059, "step": 7550 }, { "epoch": 0.38180854018837906, "grad_norm": 3.265575132967205, "learning_rate": 6.182947772502273e-06, "loss": 0.3761, "step": 7560 }, { "epoch": 0.382313577939951, "grad_norm": 2.9358960635943485, "learning_rate": 6.1778967572482075e-06, "loss": 0.3868, "step": 7570 }, { "epoch": 0.3828186156915229, "grad_norm": 2.7564424046901324, "learning_rate": 6.172845741994142e-06, "loss": 0.4058, "step": 7580 }, { "epoch": 0.3833236534430949, "grad_norm": 2.2542661022048374, "learning_rate": 6.167794726740075e-06, "loss": 0.3878, "step": 7590 }, { "epoch": 0.3838286911946668, "grad_norm": 3.185426579737794, "learning_rate": 6.162743711486009e-06, "loss": 0.399, "step": 7600 }, { "epoch": 0.38433372894623874, "grad_norm": 3.3739552822928665, "learning_rate": 6.157692696231943e-06, "loss": 0.3915, "step": 7610 }, { "epoch": 0.38483876669781064, "grad_norm": 6.412123283698192, "learning_rate": 6.1526416809778764e-06, "loss": 0.3815, "step": 7620 }, { "epoch": 0.3853438044493826, "grad_norm": 5.5629296211963215, "learning_rate": 6.1475906657238105e-06, "loss": 0.3794, "step": 7630 }, { "epoch": 0.3858488422009545, "grad_norm": 3.130873953540236, "learning_rate": 6.1425396504697455e-06, "loss": 0.3971, "step": 7640 }, { "epoch": 0.38635387995252646, "grad_norm": 2.92441715259481, "learning_rate": 6.13748863521568e-06, "loss": 0.396, "step": 7650 }, { "epoch": 0.38685891770409836, "grad_norm": 4.577909010601426, "learning_rate": 6.132437619961613e-06, "loss": 0.4041, "step": 7660 }, { "epoch": 0.3873639554556703, "grad_norm": 6.930598411493494, "learning_rate": 6.127386604707547e-06, "loss": 0.4031, "step": 7670 }, { "epoch": 0.3878689932072422, "grad_norm": 3.4417274152302513, "learning_rate": 6.122335589453481e-06, "loss": 0.3806, "step": 7680 }, { "epoch": 0.3883740309588142, "grad_norm": 8.637005031113738, "learning_rate": 6.1172845741994144e-06, "loss": 0.3847, "step": 7690 }, { "epoch": 0.3888790687103861, "grad_norm": 5.039168163562101, "learning_rate": 6.1122335589453486e-06, "loss": 0.3927, "step": 7700 }, { "epoch": 0.38938410646195803, "grad_norm": 2.3470164311901307, "learning_rate": 6.107182543691283e-06, "loss": 0.38, "step": 7710 }, { "epoch": 0.38988914421352994, "grad_norm": 10.576642650070218, "learning_rate": 6.102131528437216e-06, "loss": 0.3792, "step": 7720 }, { "epoch": 0.3903941819651019, "grad_norm": 4.420199483186855, "learning_rate": 6.09708051318315e-06, "loss": 0.4023, "step": 7730 }, { "epoch": 0.3908992197166738, "grad_norm": 4.194934818506613, "learning_rate": 6.092029497929084e-06, "loss": 0.3889, "step": 7740 }, { "epoch": 0.39140425746824575, "grad_norm": 4.111966133004914, "learning_rate": 6.0869784826750175e-06, "loss": 0.4035, "step": 7750 }, { "epoch": 0.39190929521981765, "grad_norm": 2.451588743941197, "learning_rate": 6.081927467420952e-06, "loss": 0.3713, "step": 7760 }, { "epoch": 0.3924143329713896, "grad_norm": 4.679648209446588, "learning_rate": 6.076876452166886e-06, "loss": 0.3841, "step": 7770 }, { "epoch": 0.3929193707229615, "grad_norm": 5.530044736355814, "learning_rate": 6.071825436912821e-06, "loss": 0.3775, "step": 7780 }, { "epoch": 0.39342440847453347, "grad_norm": 8.103071297854475, "learning_rate": 6.066774421658754e-06, "loss": 0.3977, "step": 7790 }, { "epoch": 0.39392944622610543, "grad_norm": 3.4887947505414787, "learning_rate": 6.061723406404688e-06, "loss": 0.3855, "step": 7800 }, { "epoch": 0.39443448397767733, "grad_norm": 8.590868528737689, "learning_rate": 6.056672391150622e-06, "loss": 0.4039, "step": 7810 }, { "epoch": 0.3949395217292493, "grad_norm": 6.459621411276858, "learning_rate": 6.0516213758965555e-06, "loss": 0.3877, "step": 7820 }, { "epoch": 0.3954445594808212, "grad_norm": 5.585759607891893, "learning_rate": 6.04657036064249e-06, "loss": 0.382, "step": 7830 }, { "epoch": 0.39594959723239315, "grad_norm": 13.909241772822636, "learning_rate": 6.041519345388424e-06, "loss": 0.3604, "step": 7840 }, { "epoch": 0.39645463498396505, "grad_norm": 8.283993010778675, "learning_rate": 6.036468330134357e-06, "loss": 0.3892, "step": 7850 }, { "epoch": 0.396959672735537, "grad_norm": 5.670024832457869, "learning_rate": 6.031417314880291e-06, "loss": 0.3758, "step": 7860 }, { "epoch": 0.3974647104871089, "grad_norm": 6.589956544879753, "learning_rate": 6.026366299626225e-06, "loss": 0.3788, "step": 7870 }, { "epoch": 0.39796974823868086, "grad_norm": 38.93534106041679, "learning_rate": 6.0213152843721586e-06, "loss": 0.3959, "step": 7880 }, { "epoch": 0.39847478599025277, "grad_norm": 9.349039394830367, "learning_rate": 6.016264269118093e-06, "loss": 0.4026, "step": 7890 }, { "epoch": 0.3989798237418247, "grad_norm": 12.651279564480014, "learning_rate": 6.011213253864027e-06, "loss": 0.3953, "step": 7900 }, { "epoch": 0.3994848614933966, "grad_norm": 12.369720860667183, "learning_rate": 6.006162238609962e-06, "loss": 0.3834, "step": 7910 }, { "epoch": 0.3999898992449686, "grad_norm": 6.935402086209793, "learning_rate": 6.001111223355895e-06, "loss": 0.3855, "step": 7920 }, { "epoch": 0.4004949369965405, "grad_norm": 14.485519276092187, "learning_rate": 5.996060208101829e-06, "loss": 0.3794, "step": 7930 }, { "epoch": 0.40099997474811244, "grad_norm": 7.8583919362498795, "learning_rate": 5.991009192847763e-06, "loss": 0.3922, "step": 7940 }, { "epoch": 0.40150501249968434, "grad_norm": 11.298566008159142, "learning_rate": 5.985958177593697e-06, "loss": 0.4002, "step": 7950 }, { "epoch": 0.4020100502512563, "grad_norm": 28.291667710287033, "learning_rate": 5.980907162339631e-06, "loss": 0.3621, "step": 7960 }, { "epoch": 0.4025150880028282, "grad_norm": 17.877462639004314, "learning_rate": 5.975856147085565e-06, "loss": 0.3968, "step": 7970 }, { "epoch": 0.40302012575440016, "grad_norm": 8.068693573421475, "learning_rate": 5.970805131831499e-06, "loss": 0.3795, "step": 7980 }, { "epoch": 0.40352516350597206, "grad_norm": 6.199946065309128, "learning_rate": 5.965754116577432e-06, "loss": 0.3909, "step": 7990 }, { "epoch": 0.404030201257544, "grad_norm": 13.7274262284846, "learning_rate": 5.960703101323366e-06, "loss": 0.3728, "step": 8000 }, { "epoch": 0.4045352390091159, "grad_norm": 15.37764496794447, "learning_rate": 5.9556520860693005e-06, "loss": 0.3868, "step": 8010 }, { "epoch": 0.4050402767606879, "grad_norm": 7.104894136071469, "learning_rate": 5.950601070815234e-06, "loss": 0.3869, "step": 8020 }, { "epoch": 0.4055453145122598, "grad_norm": 22.085557636681344, "learning_rate": 5.945550055561168e-06, "loss": 0.3842, "step": 8030 }, { "epoch": 0.40605035226383174, "grad_norm": 8.557783879131767, "learning_rate": 5.940499040307102e-06, "loss": 0.3823, "step": 8040 }, { "epoch": 0.40655539001540364, "grad_norm": 19.231211160885596, "learning_rate": 5.935448025053037e-06, "loss": 0.3819, "step": 8050 }, { "epoch": 0.4070604277669756, "grad_norm": 7.452519753759336, "learning_rate": 5.93039700979897e-06, "loss": 0.3789, "step": 8060 }, { "epoch": 0.4075654655185475, "grad_norm": 25.150570679742312, "learning_rate": 5.925345994544904e-06, "loss": 0.3829, "step": 8070 }, { "epoch": 0.40807050327011946, "grad_norm": 12.1446176511635, "learning_rate": 5.9202949792908385e-06, "loss": 0.3925, "step": 8080 }, { "epoch": 0.40857554102169136, "grad_norm": 8.119469151619079, "learning_rate": 5.915243964036772e-06, "loss": 0.3857, "step": 8090 }, { "epoch": 0.4090805787732633, "grad_norm": 8.948591507408596, "learning_rate": 5.910192948782706e-06, "loss": 0.3757, "step": 8100 }, { "epoch": 0.4095856165248352, "grad_norm": 4.621733850994045, "learning_rate": 5.90514193352864e-06, "loss": 0.4093, "step": 8110 }, { "epoch": 0.4100906542764072, "grad_norm": 4.153858890970974, "learning_rate": 5.900090918274573e-06, "loss": 0.3834, "step": 8120 }, { "epoch": 0.4105956920279791, "grad_norm": 7.250782877265542, "learning_rate": 5.895039903020507e-06, "loss": 0.3914, "step": 8130 }, { "epoch": 0.41110072977955103, "grad_norm": 7.214997449416598, "learning_rate": 5.8899888877664415e-06, "loss": 0.3763, "step": 8140 }, { "epoch": 0.41160576753112293, "grad_norm": 7.292312503512012, "learning_rate": 5.884937872512375e-06, "loss": 0.3696, "step": 8150 }, { "epoch": 0.4121108052826949, "grad_norm": 8.564034765070186, "learning_rate": 5.879886857258309e-06, "loss": 0.36, "step": 8160 }, { "epoch": 0.4126158430342668, "grad_norm": 5.03381059488658, "learning_rate": 5.874835842004243e-06, "loss": 0.3915, "step": 8170 }, { "epoch": 0.41312088078583875, "grad_norm": 7.1010748327814275, "learning_rate": 5.869784826750178e-06, "loss": 0.3831, "step": 8180 }, { "epoch": 0.41362591853741065, "grad_norm": 3.8185356541714177, "learning_rate": 5.864733811496111e-06, "loss": 0.3821, "step": 8190 }, { "epoch": 0.4141309562889826, "grad_norm": 15.840372845269206, "learning_rate": 5.8596827962420454e-06, "loss": 0.3703, "step": 8200 }, { "epoch": 0.4146359940405545, "grad_norm": 8.469089513751111, "learning_rate": 5.8546317809879796e-06, "loss": 0.3672, "step": 8210 }, { "epoch": 0.41514103179212647, "grad_norm": 5.590802338147382, "learning_rate": 5.849580765733913e-06, "loss": 0.3893, "step": 8220 }, { "epoch": 0.41564606954369837, "grad_norm": 8.673788237261087, "learning_rate": 5.844529750479847e-06, "loss": 0.3802, "step": 8230 }, { "epoch": 0.41615110729527033, "grad_norm": 4.9263977139934, "learning_rate": 5.839478735225781e-06, "loss": 0.3699, "step": 8240 }, { "epoch": 0.41665614504684223, "grad_norm": 2.3618077978561134, "learning_rate": 5.834427719971714e-06, "loss": 0.4169, "step": 8250 }, { "epoch": 0.4171611827984142, "grad_norm": 3.4421082504609215, "learning_rate": 5.8293767047176485e-06, "loss": 0.3895, "step": 8260 }, { "epoch": 0.4176662205499861, "grad_norm": 9.036063807272322, "learning_rate": 5.824325689463583e-06, "loss": 0.3911, "step": 8270 }, { "epoch": 0.41817125830155805, "grad_norm": 2.726905279515538, "learning_rate": 5.819274674209517e-06, "loss": 0.3906, "step": 8280 }, { "epoch": 0.41867629605312995, "grad_norm": 3.1988219571320173, "learning_rate": 5.81422365895545e-06, "loss": 0.3864, "step": 8290 }, { "epoch": 0.4191813338047019, "grad_norm": 3.45185619640904, "learning_rate": 5.809172643701384e-06, "loss": 0.3985, "step": 8300 }, { "epoch": 0.4196863715562738, "grad_norm": 5.02491265508067, "learning_rate": 5.804121628447318e-06, "loss": 0.3907, "step": 8310 }, { "epoch": 0.42019140930784576, "grad_norm": 3.5658556661908443, "learning_rate": 5.799070613193253e-06, "loss": 0.3923, "step": 8320 }, { "epoch": 0.42069644705941767, "grad_norm": 5.978581813826559, "learning_rate": 5.7940195979391865e-06, "loss": 0.3844, "step": 8330 }, { "epoch": 0.4212014848109896, "grad_norm": 8.14240158938725, "learning_rate": 5.788968582685121e-06, "loss": 0.3978, "step": 8340 }, { "epoch": 0.4217065225625615, "grad_norm": 3.181846621853647, "learning_rate": 5.783917567431055e-06, "loss": 0.3992, "step": 8350 }, { "epoch": 0.4222115603141335, "grad_norm": 8.322284252076804, "learning_rate": 5.778866552176988e-06, "loss": 0.3836, "step": 8360 }, { "epoch": 0.4227165980657054, "grad_norm": 4.76585783468973, "learning_rate": 5.773815536922922e-06, "loss": 0.3904, "step": 8370 }, { "epoch": 0.42322163581727734, "grad_norm": 3.857044578534056, "learning_rate": 5.768764521668856e-06, "loss": 0.3851, "step": 8380 }, { "epoch": 0.4237266735688493, "grad_norm": 5.058592219879147, "learning_rate": 5.7637135064147895e-06, "loss": 0.3867, "step": 8390 }, { "epoch": 0.4242317113204212, "grad_norm": 15.788479310500968, "learning_rate": 5.758662491160724e-06, "loss": 0.3876, "step": 8400 }, { "epoch": 0.42473674907199316, "grad_norm": 14.796150435899415, "learning_rate": 5.753611475906658e-06, "loss": 0.3713, "step": 8410 }, { "epoch": 0.42524178682356506, "grad_norm": 6.198372860755761, "learning_rate": 5.748560460652591e-06, "loss": 0.3959, "step": 8420 }, { "epoch": 0.425746824575137, "grad_norm": 4.004395808383159, "learning_rate": 5.743509445398525e-06, "loss": 0.3879, "step": 8430 }, { "epoch": 0.4262518623267089, "grad_norm": 24.753829770708524, "learning_rate": 5.738458430144459e-06, "loss": 0.3907, "step": 8440 }, { "epoch": 0.4267569000782809, "grad_norm": 3.79416616415335, "learning_rate": 5.733407414890394e-06, "loss": 0.3716, "step": 8450 }, { "epoch": 0.4272619378298528, "grad_norm": 16.793962044251266, "learning_rate": 5.7283563996363276e-06, "loss": 0.38, "step": 8460 }, { "epoch": 0.42776697558142474, "grad_norm": 6.987194101161238, "learning_rate": 5.723305384382262e-06, "loss": 0.3741, "step": 8470 }, { "epoch": 0.42827201333299664, "grad_norm": 6.1003803215690295, "learning_rate": 5.718254369128196e-06, "loss": 0.3836, "step": 8480 }, { "epoch": 0.4287770510845686, "grad_norm": 11.39191383765078, "learning_rate": 5.713203353874129e-06, "loss": 0.3785, "step": 8490 }, { "epoch": 0.4292820888361405, "grad_norm": 4.216231578608531, "learning_rate": 5.708152338620063e-06, "loss": 0.3849, "step": 8500 }, { "epoch": 0.42978712658771245, "grad_norm": 26.748832800632403, "learning_rate": 5.703101323365997e-06, "loss": 0.3877, "step": 8510 }, { "epoch": 0.43029216433928436, "grad_norm": 5.3822939480121414, "learning_rate": 5.698050308111931e-06, "loss": 0.3692, "step": 8520 }, { "epoch": 0.4307972020908563, "grad_norm": 4.249521852101111, "learning_rate": 5.692999292857865e-06, "loss": 0.3817, "step": 8530 }, { "epoch": 0.4313022398424282, "grad_norm": 6.698259673877979, "learning_rate": 5.687948277603799e-06, "loss": 0.3881, "step": 8540 }, { "epoch": 0.43180727759400017, "grad_norm": 6.493588412853115, "learning_rate": 5.682897262349732e-06, "loss": 0.3977, "step": 8550 }, { "epoch": 0.4323123153455721, "grad_norm": 3.733362569921758, "learning_rate": 5.677846247095666e-06, "loss": 0.3982, "step": 8560 }, { "epoch": 0.43281735309714403, "grad_norm": 4.847622020933204, "learning_rate": 5.6727952318416e-06, "loss": 0.3769, "step": 8570 }, { "epoch": 0.43332239084871593, "grad_norm": 7.5594667479754785, "learning_rate": 5.667744216587534e-06, "loss": 0.3907, "step": 8580 }, { "epoch": 0.4338274286002879, "grad_norm": 3.201682379163367, "learning_rate": 5.662693201333469e-06, "loss": 0.3729, "step": 8590 }, { "epoch": 0.4343324663518598, "grad_norm": 2.5878076554319898, "learning_rate": 5.657642186079403e-06, "loss": 0.3956, "step": 8600 }, { "epoch": 0.43483750410343175, "grad_norm": 17.488965325778878, "learning_rate": 5.652591170825337e-06, "loss": 0.3712, "step": 8610 }, { "epoch": 0.43534254185500365, "grad_norm": 4.116982393940921, "learning_rate": 5.64754015557127e-06, "loss": 0.3893, "step": 8620 }, { "epoch": 0.4358475796065756, "grad_norm": 3.9148601756933554, "learning_rate": 5.642489140317204e-06, "loss": 0.3887, "step": 8630 }, { "epoch": 0.4363526173581475, "grad_norm": 7.764634135356721, "learning_rate": 5.637438125063138e-06, "loss": 0.3776, "step": 8640 }, { "epoch": 0.43685765510971947, "grad_norm": 9.506827681627305, "learning_rate": 5.6323871098090725e-06, "loss": 0.3989, "step": 8650 }, { "epoch": 0.43736269286129137, "grad_norm": 3.2832190526050242, "learning_rate": 5.627336094555006e-06, "loss": 0.3982, "step": 8660 }, { "epoch": 0.4378677306128633, "grad_norm": 4.082534248878624, "learning_rate": 5.62228507930094e-06, "loss": 0.3901, "step": 8670 }, { "epoch": 0.43837276836443523, "grad_norm": 3.077297177968144, "learning_rate": 5.617234064046874e-06, "loss": 0.3669, "step": 8680 }, { "epoch": 0.4388778061160072, "grad_norm": 2.9330241633656167, "learning_rate": 5.612183048792807e-06, "loss": 0.3934, "step": 8690 }, { "epoch": 0.4393828438675791, "grad_norm": 3.3633206477485222, "learning_rate": 5.6071320335387414e-06, "loss": 0.3949, "step": 8700 }, { "epoch": 0.43988788161915104, "grad_norm": 4.3033520111069, "learning_rate": 5.6020810182846756e-06, "loss": 0.3989, "step": 8710 }, { "epoch": 0.44039291937072295, "grad_norm": 3.106225393468076, "learning_rate": 5.597030003030609e-06, "loss": 0.3827, "step": 8720 }, { "epoch": 0.4408979571222949, "grad_norm": 3.245685369854172, "learning_rate": 5.591978987776544e-06, "loss": 0.3892, "step": 8730 }, { "epoch": 0.4414029948738668, "grad_norm": 3.381863989802099, "learning_rate": 5.586927972522478e-06, "loss": 0.3945, "step": 8740 }, { "epoch": 0.44190803262543876, "grad_norm": 5.237035603972838, "learning_rate": 5.581876957268412e-06, "loss": 0.3852, "step": 8750 }, { "epoch": 0.44241307037701066, "grad_norm": 3.927311917890188, "learning_rate": 5.576825942014345e-06, "loss": 0.396, "step": 8760 }, { "epoch": 0.4429181081285826, "grad_norm": 3.9833756210512634, "learning_rate": 5.5717749267602795e-06, "loss": 0.3823, "step": 8770 }, { "epoch": 0.4434231458801545, "grad_norm": 4.82672597736031, "learning_rate": 5.566723911506214e-06, "loss": 0.3837, "step": 8780 }, { "epoch": 0.4439281836317265, "grad_norm": 6.6207510733703, "learning_rate": 5.561672896252147e-06, "loss": 0.3846, "step": 8790 }, { "epoch": 0.4444332213832984, "grad_norm": 4.761029989854409, "learning_rate": 5.556621880998081e-06, "loss": 0.3851, "step": 8800 }, { "epoch": 0.44493825913487034, "grad_norm": 7.583147217262274, "learning_rate": 5.551570865744015e-06, "loss": 0.3897, "step": 8810 }, { "epoch": 0.44544329688644224, "grad_norm": 8.267756155680992, "learning_rate": 5.546519850489948e-06, "loss": 0.3924, "step": 8820 }, { "epoch": 0.4459483346380142, "grad_norm": 10.81298502824958, "learning_rate": 5.5414688352358825e-06, "loss": 0.3768, "step": 8830 }, { "epoch": 0.4464533723895861, "grad_norm": 5.44259386052073, "learning_rate": 5.536417819981817e-06, "loss": 0.3766, "step": 8840 }, { "epoch": 0.44695841014115806, "grad_norm": 19.848594487097966, "learning_rate": 5.53136680472775e-06, "loss": 0.386, "step": 8850 }, { "epoch": 0.44746344789272996, "grad_norm": 9.264967854622206, "learning_rate": 5.526315789473685e-06, "loss": 0.3916, "step": 8860 }, { "epoch": 0.4479684856443019, "grad_norm": 11.970151482632554, "learning_rate": 5.521264774219619e-06, "loss": 0.3878, "step": 8870 }, { "epoch": 0.4484735233958738, "grad_norm": 4.562463656013562, "learning_rate": 5.516213758965553e-06, "loss": 0.3827, "step": 8880 }, { "epoch": 0.4489785611474458, "grad_norm": 8.92273565377152, "learning_rate": 5.511162743711486e-06, "loss": 0.3691, "step": 8890 }, { "epoch": 0.4494835988990177, "grad_norm": 6.168748754085141, "learning_rate": 5.5061117284574205e-06, "loss": 0.3762, "step": 8900 }, { "epoch": 0.44998863665058964, "grad_norm": 9.45298676374925, "learning_rate": 5.501060713203355e-06, "loss": 0.39, "step": 8910 }, { "epoch": 0.45049367440216154, "grad_norm": 8.9825721118566, "learning_rate": 5.496009697949288e-06, "loss": 0.3946, "step": 8920 }, { "epoch": 0.4509987121537335, "grad_norm": 3.970319391221007, "learning_rate": 5.490958682695222e-06, "loss": 0.3766, "step": 8930 }, { "epoch": 0.4515037499053054, "grad_norm": 10.786278133368974, "learning_rate": 5.485907667441156e-06, "loss": 0.3609, "step": 8940 }, { "epoch": 0.45200878765687735, "grad_norm": 10.415656893523584, "learning_rate": 5.4808566521870895e-06, "loss": 0.381, "step": 8950 }, { "epoch": 0.45251382540844926, "grad_norm": 6.642215363932757, "learning_rate": 5.475805636933024e-06, "loss": 0.3958, "step": 8960 }, { "epoch": 0.4530188631600212, "grad_norm": 23.15078550027285, "learning_rate": 5.470754621678958e-06, "loss": 0.3765, "step": 8970 }, { "epoch": 0.4535239009115931, "grad_norm": 12.962039757228773, "learning_rate": 5.465703606424891e-06, "loss": 0.3796, "step": 8980 }, { "epoch": 0.45402893866316507, "grad_norm": 10.834056396167355, "learning_rate": 5.460652591170825e-06, "loss": 0.3825, "step": 8990 }, { "epoch": 0.45453397641473703, "grad_norm": 31.830300939594174, "learning_rate": 5.45560157591676e-06, "loss": 0.3799, "step": 9000 }, { "epoch": 0.45503901416630893, "grad_norm": 7.317578019104587, "learning_rate": 5.450550560662694e-06, "loss": 0.3617, "step": 9010 }, { "epoch": 0.4555440519178809, "grad_norm": 18.51705064335606, "learning_rate": 5.445499545408628e-06, "loss": 0.3843, "step": 9020 }, { "epoch": 0.4560490896694528, "grad_norm": 12.73637775470279, "learning_rate": 5.440448530154562e-06, "loss": 0.3533, "step": 9030 }, { "epoch": 0.45655412742102475, "grad_norm": 13.089783277969067, "learning_rate": 5.435397514900496e-06, "loss": 0.3844, "step": 9040 }, { "epoch": 0.45705916517259665, "grad_norm": 38.50162494773606, "learning_rate": 5.43034649964643e-06, "loss": 0.3839, "step": 9050 }, { "epoch": 0.4575642029241686, "grad_norm": 5.6808406753592715, "learning_rate": 5.425295484392363e-06, "loss": 0.3938, "step": 9060 }, { "epoch": 0.4580692406757405, "grad_norm": 9.303424412707237, "learning_rate": 5.420244469138297e-06, "loss": 0.3864, "step": 9070 }, { "epoch": 0.45857427842731247, "grad_norm": 6.58855670966557, "learning_rate": 5.415193453884231e-06, "loss": 0.3802, "step": 9080 }, { "epoch": 0.45907931617888437, "grad_norm": 7.626250897081635, "learning_rate": 5.410142438630165e-06, "loss": 0.3779, "step": 9090 }, { "epoch": 0.4595843539304563, "grad_norm": 7.20955512967183, "learning_rate": 5.405091423376099e-06, "loss": 0.3852, "step": 9100 }, { "epoch": 0.4600893916820282, "grad_norm": 3.600490312288401, "learning_rate": 5.400040408122033e-06, "loss": 0.3734, "step": 9110 }, { "epoch": 0.4605944294336002, "grad_norm": 8.58899517905405, "learning_rate": 5.394989392867966e-06, "loss": 0.3789, "step": 9120 }, { "epoch": 0.4610994671851721, "grad_norm": 6.98087393231808, "learning_rate": 5.389938377613901e-06, "loss": 0.3833, "step": 9130 }, { "epoch": 0.46160450493674404, "grad_norm": 4.3141382007799765, "learning_rate": 5.384887362359835e-06, "loss": 0.3772, "step": 9140 }, { "epoch": 0.46210954268831594, "grad_norm": 4.9903497470267615, "learning_rate": 5.379836347105769e-06, "loss": 0.3777, "step": 9150 }, { "epoch": 0.4626145804398879, "grad_norm": 5.928389544490131, "learning_rate": 5.374785331851703e-06, "loss": 0.3862, "step": 9160 }, { "epoch": 0.4631196181914598, "grad_norm": 4.797470596868366, "learning_rate": 5.369734316597637e-06, "loss": 0.3997, "step": 9170 }, { "epoch": 0.46362465594303176, "grad_norm": 8.670905903895585, "learning_rate": 5.364683301343571e-06, "loss": 0.3678, "step": 9180 }, { "epoch": 0.46412969369460366, "grad_norm": 6.464493728216097, "learning_rate": 5.359632286089504e-06, "loss": 0.3815, "step": 9190 }, { "epoch": 0.4646347314461756, "grad_norm": 12.803983058209836, "learning_rate": 5.354581270835438e-06, "loss": 0.38, "step": 9200 }, { "epoch": 0.4651397691977475, "grad_norm": 9.24913498804073, "learning_rate": 5.3495302555813724e-06, "loss": 0.3818, "step": 9210 }, { "epoch": 0.4656448069493195, "grad_norm": 6.124104683054, "learning_rate": 5.344479240327306e-06, "loss": 0.3857, "step": 9220 }, { "epoch": 0.4661498447008914, "grad_norm": 19.982398897538843, "learning_rate": 5.33942822507324e-06, "loss": 0.3759, "step": 9230 }, { "epoch": 0.46665488245246334, "grad_norm": 6.143149352411427, "learning_rate": 5.334377209819174e-06, "loss": 0.3901, "step": 9240 }, { "epoch": 0.46715992020403524, "grad_norm": 11.010467552273573, "learning_rate": 5.329326194565107e-06, "loss": 0.3798, "step": 9250 }, { "epoch": 0.4676649579556072, "grad_norm": 5.615638251464779, "learning_rate": 5.324275179311041e-06, "loss": 0.3855, "step": 9260 }, { "epoch": 0.4681699957071791, "grad_norm": 5.10238166508234, "learning_rate": 5.319224164056976e-06, "loss": 0.3793, "step": 9270 }, { "epoch": 0.46867503345875106, "grad_norm": 11.264038165848222, "learning_rate": 5.3141731488029105e-06, "loss": 0.3912, "step": 9280 }, { "epoch": 0.46918007121032296, "grad_norm": 4.638158847573904, "learning_rate": 5.309122133548844e-06, "loss": 0.3874, "step": 9290 }, { "epoch": 0.4696851089618949, "grad_norm": 5.40563230905546, "learning_rate": 5.304071118294778e-06, "loss": 0.3688, "step": 9300 }, { "epoch": 0.4701901467134668, "grad_norm": 7.336455357223306, "learning_rate": 5.299020103040712e-06, "loss": 0.3595, "step": 9310 }, { "epoch": 0.4706951844650388, "grad_norm": 6.604255767738522, "learning_rate": 5.293969087786645e-06, "loss": 0.3851, "step": 9320 }, { "epoch": 0.4712002222166107, "grad_norm": 8.434342768902814, "learning_rate": 5.288918072532579e-06, "loss": 0.3752, "step": 9330 }, { "epoch": 0.47170525996818263, "grad_norm": 7.991264568372306, "learning_rate": 5.2838670572785135e-06, "loss": 0.3847, "step": 9340 }, { "epoch": 0.47221029771975453, "grad_norm": 7.689424953061599, "learning_rate": 5.278816042024447e-06, "loss": 0.381, "step": 9350 }, { "epoch": 0.4727153354713265, "grad_norm": 8.92289056823504, "learning_rate": 5.273765026770381e-06, "loss": 0.3766, "step": 9360 }, { "epoch": 0.4732203732228984, "grad_norm": 6.798146146074497, "learning_rate": 5.268714011516315e-06, "loss": 0.3973, "step": 9370 }, { "epoch": 0.47372541097447035, "grad_norm": 4.0943212163341105, "learning_rate": 5.263662996262249e-06, "loss": 0.3852, "step": 9380 }, { "epoch": 0.47423044872604225, "grad_norm": 7.923369250973975, "learning_rate": 5.2586119810081824e-06, "loss": 0.3759, "step": 9390 }, { "epoch": 0.4747354864776142, "grad_norm": 6.958844060305958, "learning_rate": 5.253560965754117e-06, "loss": 0.3814, "step": 9400 }, { "epoch": 0.4752405242291861, "grad_norm": 5.637050541114169, "learning_rate": 5.2485099505000515e-06, "loss": 0.3665, "step": 9410 }, { "epoch": 0.47574556198075807, "grad_norm": 7.801257727546048, "learning_rate": 5.243458935245986e-06, "loss": 0.3744, "step": 9420 }, { "epoch": 0.47625059973232997, "grad_norm": 8.37919084606113, "learning_rate": 5.238407919991919e-06, "loss": 0.394, "step": 9430 }, { "epoch": 0.47675563748390193, "grad_norm": 4.463876576531971, "learning_rate": 5.233356904737853e-06, "loss": 0.3712, "step": 9440 }, { "epoch": 0.47726067523547383, "grad_norm": 3.5151581874917253, "learning_rate": 5.228305889483787e-06, "loss": 0.3733, "step": 9450 }, { "epoch": 0.4777657129870458, "grad_norm": 6.525584094885654, "learning_rate": 5.2232548742297204e-06, "loss": 0.3769, "step": 9460 }, { "epoch": 0.4782707507386177, "grad_norm": 3.5568609941598344, "learning_rate": 5.2182038589756546e-06, "loss": 0.3902, "step": 9470 }, { "epoch": 0.47877578849018965, "grad_norm": 4.842870672953731, "learning_rate": 5.213152843721589e-06, "loss": 0.3794, "step": 9480 }, { "epoch": 0.47928082624176155, "grad_norm": 6.265357359911366, "learning_rate": 5.208101828467522e-06, "loss": 0.3803, "step": 9490 }, { "epoch": 0.4797858639933335, "grad_norm": 8.175660778042406, "learning_rate": 5.203050813213456e-06, "loss": 0.3835, "step": 9500 }, { "epoch": 0.4802909017449054, "grad_norm": 2.9195122634550112, "learning_rate": 5.19799979795939e-06, "loss": 0.3813, "step": 9510 }, { "epoch": 0.48079593949647736, "grad_norm": 3.632448992408266, "learning_rate": 5.1929487827053235e-06, "loss": 0.3684, "step": 9520 }, { "epoch": 0.48130097724804927, "grad_norm": 4.0935522105259095, "learning_rate": 5.187897767451258e-06, "loss": 0.4025, "step": 9530 }, { "epoch": 0.4818060149996212, "grad_norm": 2.3275681028787005, "learning_rate": 5.182846752197193e-06, "loss": 0.388, "step": 9540 }, { "epoch": 0.4823110527511931, "grad_norm": 2.0956548268446458, "learning_rate": 5.177795736943127e-06, "loss": 0.4031, "step": 9550 }, { "epoch": 0.4828160905027651, "grad_norm": 2.173446663356269, "learning_rate": 5.17274472168906e-06, "loss": 0.3984, "step": 9560 }, { "epoch": 0.483321128254337, "grad_norm": 3.2812553226056713, "learning_rate": 5.167693706434994e-06, "loss": 0.384, "step": 9570 }, { "epoch": 0.48382616600590894, "grad_norm": 3.590239737946557, "learning_rate": 5.162642691180928e-06, "loss": 0.3877, "step": 9580 }, { "epoch": 0.4843312037574809, "grad_norm": 2.83308197947474, "learning_rate": 5.1575916759268615e-06, "loss": 0.3934, "step": 9590 }, { "epoch": 0.4848362415090528, "grad_norm": 3.847792584135844, "learning_rate": 5.152540660672796e-06, "loss": 0.3735, "step": 9600 }, { "epoch": 0.48534127926062476, "grad_norm": 20.75585092309958, "learning_rate": 5.14748964541873e-06, "loss": 0.3788, "step": 9610 }, { "epoch": 0.48584631701219666, "grad_norm": 4.31507443188382, "learning_rate": 5.142438630164663e-06, "loss": 0.4073, "step": 9620 }, { "epoch": 0.4863513547637686, "grad_norm": 2.675763121628948, "learning_rate": 5.137387614910597e-06, "loss": 0.3853, "step": 9630 }, { "epoch": 0.4868563925153405, "grad_norm": 3.180499977172799, "learning_rate": 5.132336599656531e-06, "loss": 0.3933, "step": 9640 }, { "epoch": 0.4873614302669125, "grad_norm": 3.2780129127069633, "learning_rate": 5.1272855844024646e-06, "loss": 0.3862, "step": 9650 }, { "epoch": 0.4878664680184844, "grad_norm": 2.650097724922774, "learning_rate": 5.122234569148399e-06, "loss": 0.3769, "step": 9660 }, { "epoch": 0.48837150577005634, "grad_norm": 19.664449194387533, "learning_rate": 5.117183553894333e-06, "loss": 0.3909, "step": 9670 }, { "epoch": 0.48887654352162824, "grad_norm": 3.055574678383631, "learning_rate": 5.112132538640268e-06, "loss": 0.368, "step": 9680 }, { "epoch": 0.4893815812732002, "grad_norm": 4.1275772825798285, "learning_rate": 5.107081523386201e-06, "loss": 0.3807, "step": 9690 }, { "epoch": 0.4898866190247721, "grad_norm": 2.680370704403155, "learning_rate": 5.102030508132135e-06, "loss": 0.3852, "step": 9700 }, { "epoch": 0.49039165677634405, "grad_norm": 5.918612399643231, "learning_rate": 5.096979492878069e-06, "loss": 0.3838, "step": 9710 }, { "epoch": 0.49089669452791596, "grad_norm": 3.682371592006304, "learning_rate": 5.091928477624003e-06, "loss": 0.4054, "step": 9720 }, { "epoch": 0.4914017322794879, "grad_norm": 14.462676090252211, "learning_rate": 5.086877462369937e-06, "loss": 0.3992, "step": 9730 }, { "epoch": 0.4919067700310598, "grad_norm": 8.666562939588516, "learning_rate": 5.081826447115871e-06, "loss": 0.3942, "step": 9740 }, { "epoch": 0.49241180778263177, "grad_norm": 8.758850115026672, "learning_rate": 5.076775431861805e-06, "loss": 0.3895, "step": 9750 }, { "epoch": 0.4929168455342037, "grad_norm": 4.418838276589228, "learning_rate": 5.071724416607738e-06, "loss": 0.3696, "step": 9760 }, { "epoch": 0.49342188328577563, "grad_norm": 3.2634112325138016, "learning_rate": 5.066673401353672e-06, "loss": 0.392, "step": 9770 }, { "epoch": 0.49392692103734753, "grad_norm": 9.038919625118139, "learning_rate": 5.0616223860996065e-06, "loss": 0.3763, "step": 9780 }, { "epoch": 0.4944319587889195, "grad_norm": 9.874288914817955, "learning_rate": 5.05657137084554e-06, "loss": 0.3749, "step": 9790 }, { "epoch": 0.4949369965404914, "grad_norm": 4.177942828246587, "learning_rate": 5.051520355591474e-06, "loss": 0.3803, "step": 9800 }, { "epoch": 0.49544203429206335, "grad_norm": 5.209048638874338, "learning_rate": 5.046469340337409e-06, "loss": 0.377, "step": 9810 }, { "epoch": 0.49594707204363525, "grad_norm": 6.559103955892935, "learning_rate": 5.041418325083343e-06, "loss": 0.3779, "step": 9820 }, { "epoch": 0.4964521097952072, "grad_norm": 4.815427953732501, "learning_rate": 5.036367309829276e-06, "loss": 0.3832, "step": 9830 }, { "epoch": 0.4969571475467791, "grad_norm": 4.801884846686052, "learning_rate": 5.03131629457521e-06, "loss": 0.3648, "step": 9840 }, { "epoch": 0.49746218529835107, "grad_norm": 4.707895499699569, "learning_rate": 5.0262652793211445e-06, "loss": 0.3974, "step": 9850 }, { "epoch": 0.49796722304992297, "grad_norm": 3.1632503133911176, "learning_rate": 5.021214264067078e-06, "loss": 0.3645, "step": 9860 }, { "epoch": 0.4984722608014949, "grad_norm": 3.187410732261863, "learning_rate": 5.016163248813012e-06, "loss": 0.3964, "step": 9870 }, { "epoch": 0.49897729855306683, "grad_norm": 18.87501682400254, "learning_rate": 5.011112233558946e-06, "loss": 0.394, "step": 9880 }, { "epoch": 0.4994823363046388, "grad_norm": 2.727776305600166, "learning_rate": 5.006061218304879e-06, "loss": 0.384, "step": 9890 }, { "epoch": 0.4999873740562107, "grad_norm": 2.835732786228447, "learning_rate": 5.001010203050813e-06, "loss": 0.3826, "step": 9900 }, { "epoch": 0.5004924118077826, "grad_norm": 12.721805859040408, "learning_rate": 4.9959591877967475e-06, "loss": 0.3965, "step": 9910 }, { "epoch": 0.5009974495593545, "grad_norm": 4.309473567917906, "learning_rate": 4.990908172542682e-06, "loss": 0.3867, "step": 9920 }, { "epoch": 0.5015024873109265, "grad_norm": 4.385559429438772, "learning_rate": 4.985857157288616e-06, "loss": 0.3819, "step": 9930 }, { "epoch": 0.5020075250624985, "grad_norm": 3.1179751000194917, "learning_rate": 4.980806142034549e-06, "loss": 0.3678, "step": 9940 }, { "epoch": 0.5025125628140703, "grad_norm": 6.661473319925103, "learning_rate": 4.975755126780483e-06, "loss": 0.3803, "step": 9950 }, { "epoch": 0.5030176005656423, "grad_norm": 5.422867145087551, "learning_rate": 4.970704111526417e-06, "loss": 0.3724, "step": 9960 }, { "epoch": 0.5035226383172142, "grad_norm": 5.116363457400816, "learning_rate": 4.965653096272351e-06, "loss": 0.3796, "step": 9970 }, { "epoch": 0.5040276760687862, "grad_norm": 4.36781162605322, "learning_rate": 4.9606020810182856e-06, "loss": 0.3758, "step": 9980 }, { "epoch": 0.504532713820358, "grad_norm": 4.280820474874582, "learning_rate": 4.955551065764219e-06, "loss": 0.3747, "step": 9990 }, { "epoch": 0.50503775157193, "grad_norm": 57.44106262644066, "learning_rate": 4.950500050510153e-06, "loss": 0.371, "step": 10000 }, { "epoch": 0.5055427893235019, "grad_norm": 4.841001034679017, "learning_rate": 4.945449035256087e-06, "loss": 0.3779, "step": 10010 }, { "epoch": 0.5060478270750739, "grad_norm": 11.797410635025113, "learning_rate": 4.94039802000202e-06, "loss": 0.3683, "step": 10020 }, { "epoch": 0.5065528648266457, "grad_norm": 4.75455698304815, "learning_rate": 4.9353470047479545e-06, "loss": 0.3705, "step": 10030 }, { "epoch": 0.5070579025782177, "grad_norm": 4.360031532937485, "learning_rate": 4.930295989493889e-06, "loss": 0.398, "step": 10040 }, { "epoch": 0.5075629403297897, "grad_norm": 5.827246838677423, "learning_rate": 4.925244974239823e-06, "loss": 0.3841, "step": 10050 }, { "epoch": 0.5080679780813616, "grad_norm": 7.5891846258257765, "learning_rate": 4.920193958985757e-06, "loss": 0.3744, "step": 10060 }, { "epoch": 0.5085730158329335, "grad_norm": 5.9635081800997805, "learning_rate": 4.91514294373169e-06, "loss": 0.3986, "step": 10070 }, { "epoch": 0.5090780535845054, "grad_norm": 5.484063460889767, "learning_rate": 4.910091928477624e-06, "loss": 0.3839, "step": 10080 }, { "epoch": 0.5095830913360774, "grad_norm": 5.174164528404758, "learning_rate": 4.905040913223558e-06, "loss": 0.3619, "step": 10090 }, { "epoch": 0.5100881290876493, "grad_norm": 2.90936959880962, "learning_rate": 4.8999898979694925e-06, "loss": 0.3756, "step": 10100 }, { "epoch": 0.5105931668392212, "grad_norm": 4.3229203248018715, "learning_rate": 4.894938882715426e-06, "loss": 0.3832, "step": 10110 }, { "epoch": 0.5110982045907931, "grad_norm": 2.500902066389639, "learning_rate": 4.889887867461361e-06, "loss": 0.3826, "step": 10120 }, { "epoch": 0.5116032423423651, "grad_norm": 8.290016106139367, "learning_rate": 4.884836852207294e-06, "loss": 0.386, "step": 10130 }, { "epoch": 0.512108280093937, "grad_norm": 2.6008214460714894, "learning_rate": 4.879785836953228e-06, "loss": 0.3676, "step": 10140 }, { "epoch": 0.5126133178455089, "grad_norm": 2.1333825641801902, "learning_rate": 4.874734821699162e-06, "loss": 0.3898, "step": 10150 }, { "epoch": 0.5131183555970809, "grad_norm": 3.013718964658717, "learning_rate": 4.8696838064450955e-06, "loss": 0.3887, "step": 10160 }, { "epoch": 0.5136233933486528, "grad_norm": 3.6467351317483816, "learning_rate": 4.86463279119103e-06, "loss": 0.3782, "step": 10170 }, { "epoch": 0.5141284311002248, "grad_norm": 2.950522126672218, "learning_rate": 4.859581775936964e-06, "loss": 0.377, "step": 10180 }, { "epoch": 0.5146334688517967, "grad_norm": 2.252426318052208, "learning_rate": 4.854530760682898e-06, "loss": 0.3758, "step": 10190 }, { "epoch": 0.5151385066033686, "grad_norm": 4.755817616459457, "learning_rate": 4.849479745428832e-06, "loss": 0.3807, "step": 10200 }, { "epoch": 0.5156435443549405, "grad_norm": 4.046138186931513, "learning_rate": 4.844428730174765e-06, "loss": 0.391, "step": 10210 }, { "epoch": 0.5161485821065125, "grad_norm": 19.99100500952114, "learning_rate": 4.8393777149206994e-06, "loss": 0.3781, "step": 10220 }, { "epoch": 0.5166536198580844, "grad_norm": 5.479232805622522, "learning_rate": 4.8343266996666336e-06, "loss": 0.3708, "step": 10230 }, { "epoch": 0.5171586576096563, "grad_norm": 8.844870574151285, "learning_rate": 4.829275684412567e-06, "loss": 0.3711, "step": 10240 }, { "epoch": 0.5176636953612282, "grad_norm": 7.57710951045164, "learning_rate": 4.824224669158502e-06, "loss": 0.3854, "step": 10250 }, { "epoch": 0.5181687331128002, "grad_norm": 7.850442473809185, "learning_rate": 4.819173653904435e-06, "loss": 0.3786, "step": 10260 }, { "epoch": 0.5186737708643722, "grad_norm": 6.316522286246354, "learning_rate": 4.814122638650369e-06, "loss": 0.3838, "step": 10270 }, { "epoch": 0.519178808615944, "grad_norm": 11.182819849962994, "learning_rate": 4.809071623396303e-06, "loss": 0.385, "step": 10280 }, { "epoch": 0.519683846367516, "grad_norm": 9.46133568238808, "learning_rate": 4.804020608142237e-06, "loss": 0.3703, "step": 10290 }, { "epoch": 0.5201888841190879, "grad_norm": 7.691807268341042, "learning_rate": 4.798969592888171e-06, "loss": 0.3682, "step": 10300 }, { "epoch": 0.5206939218706599, "grad_norm": 9.93078233486807, "learning_rate": 4.793918577634105e-06, "loss": 0.3802, "step": 10310 }, { "epoch": 0.5211989596222317, "grad_norm": 7.510658702823232, "learning_rate": 4.788867562380039e-06, "loss": 0.3886, "step": 10320 }, { "epoch": 0.5217039973738037, "grad_norm": 22.882646270297737, "learning_rate": 4.783816547125973e-06, "loss": 0.393, "step": 10330 }, { "epoch": 0.5222090351253756, "grad_norm": 14.517964204994225, "learning_rate": 4.778765531871906e-06, "loss": 0.3803, "step": 10340 }, { "epoch": 0.5227140728769476, "grad_norm": 6.854534356382802, "learning_rate": 4.7737145166178405e-06, "loss": 0.3931, "step": 10350 }, { "epoch": 0.5232191106285194, "grad_norm": 11.309739741500756, "learning_rate": 4.768663501363775e-06, "loss": 0.3876, "step": 10360 }, { "epoch": 0.5237241483800914, "grad_norm": 8.35728833434225, "learning_rate": 4.763612486109708e-06, "loss": 0.3789, "step": 10370 }, { "epoch": 0.5242291861316634, "grad_norm": 8.871743836416062, "learning_rate": 4.758561470855642e-06, "loss": 0.3981, "step": 10380 }, { "epoch": 0.5247342238832353, "grad_norm": 15.15117759530781, "learning_rate": 4.753510455601576e-06, "loss": 0.3577, "step": 10390 }, { "epoch": 0.5252392616348072, "grad_norm": 10.766312797504908, "learning_rate": 4.74845944034751e-06, "loss": 0.3744, "step": 10400 }, { "epoch": 0.5257442993863791, "grad_norm": 44.757871998844294, "learning_rate": 4.743408425093444e-06, "loss": 0.3697, "step": 10410 }, { "epoch": 0.5262493371379511, "grad_norm": 16.586045391427703, "learning_rate": 4.738357409839378e-06, "loss": 0.3746, "step": 10420 }, { "epoch": 0.526754374889523, "grad_norm": 50.773287263268436, "learning_rate": 4.733306394585312e-06, "loss": 0.373, "step": 10430 }, { "epoch": 0.5272594126410949, "grad_norm": 10.502847442362183, "learning_rate": 4.728255379331246e-06, "loss": 0.387, "step": 10440 }, { "epoch": 0.5277644503926668, "grad_norm": 16.528099312938107, "learning_rate": 4.72320436407718e-06, "loss": 0.383, "step": 10450 }, { "epoch": 0.5282694881442388, "grad_norm": 17.8799111844064, "learning_rate": 4.718153348823114e-06, "loss": 0.381, "step": 10460 }, { "epoch": 0.5287745258958108, "grad_norm": 14.63753428467757, "learning_rate": 4.713102333569048e-06, "loss": 0.3837, "step": 10470 }, { "epoch": 0.5292795636473826, "grad_norm": 13.42251301006677, "learning_rate": 4.7080513183149816e-06, "loss": 0.3595, "step": 10480 }, { "epoch": 0.5297846013989546, "grad_norm": 30.157056141735794, "learning_rate": 4.703000303060916e-06, "loss": 0.3648, "step": 10490 }, { "epoch": 0.5302896391505265, "grad_norm": 28.508594444052306, "learning_rate": 4.69794928780685e-06, "loss": 0.3793, "step": 10500 }, { "epoch": 0.5307946769020985, "grad_norm": 28.922781791221336, "learning_rate": 4.692898272552783e-06, "loss": 0.3745, "step": 10510 }, { "epoch": 0.5312997146536703, "grad_norm": 13.721901669008831, "learning_rate": 4.687847257298718e-06, "loss": 0.375, "step": 10520 }, { "epoch": 0.5318047524052423, "grad_norm": 65.55837391568407, "learning_rate": 4.682796242044651e-06, "loss": 0.3801, "step": 10530 }, { "epoch": 0.5323097901568142, "grad_norm": 16.351443187824426, "learning_rate": 4.6777452267905855e-06, "loss": 0.3727, "step": 10540 }, { "epoch": 0.5328148279083862, "grad_norm": 24.727000388948138, "learning_rate": 4.67269421153652e-06, "loss": 0.4, "step": 10550 }, { "epoch": 0.533319865659958, "grad_norm": 16.92930292363904, "learning_rate": 4.667643196282453e-06, "loss": 0.3818, "step": 10560 }, { "epoch": 0.53382490341153, "grad_norm": 8.293298262780807, "learning_rate": 4.662592181028387e-06, "loss": 0.3914, "step": 10570 }, { "epoch": 0.534329941163102, "grad_norm": 7.923596292500085, "learning_rate": 4.657541165774321e-06, "loss": 0.3835, "step": 10580 }, { "epoch": 0.5348349789146739, "grad_norm": 21.525089803513207, "learning_rate": 4.652490150520255e-06, "loss": 0.3621, "step": 10590 }, { "epoch": 0.5353400166662458, "grad_norm": 5.744657346480749, "learning_rate": 4.647439135266189e-06, "loss": 0.3674, "step": 10600 }, { "epoch": 0.5358450544178177, "grad_norm": 4.54613701131844, "learning_rate": 4.642388120012123e-06, "loss": 0.3804, "step": 10610 }, { "epoch": 0.5363500921693897, "grad_norm": 14.97113129892423, "learning_rate": 4.637337104758057e-06, "loss": 0.3555, "step": 10620 }, { "epoch": 0.5368551299209616, "grad_norm": 11.639674507484868, "learning_rate": 4.632286089503991e-06, "loss": 0.3632, "step": 10630 }, { "epoch": 0.5373601676725335, "grad_norm": 19.897945155280457, "learning_rate": 4.627235074249924e-06, "loss": 0.3747, "step": 10640 }, { "epoch": 0.5378652054241054, "grad_norm": 6.364339652707885, "learning_rate": 4.622184058995858e-06, "loss": 0.3786, "step": 10650 }, { "epoch": 0.5383702431756774, "grad_norm": 9.457676241588533, "learning_rate": 4.617133043741792e-06, "loss": 0.3756, "step": 10660 }, { "epoch": 0.5388752809272493, "grad_norm": 44.15827262329639, "learning_rate": 4.6120820284877265e-06, "loss": 0.3741, "step": 10670 }, { "epoch": 0.5393803186788212, "grad_norm": 17.038339271477003, "learning_rate": 4.607031013233661e-06, "loss": 0.3736, "step": 10680 }, { "epoch": 0.5398853564303931, "grad_norm": 7.117484724149919, "learning_rate": 4.601979997979594e-06, "loss": 0.3723, "step": 10690 }, { "epoch": 0.5403903941819651, "grad_norm": 42.36231217164134, "learning_rate": 4.596928982725528e-06, "loss": 0.3729, "step": 10700 }, { "epoch": 0.5408954319335371, "grad_norm": 8.564465700855026, "learning_rate": 4.591877967471462e-06, "loss": 0.3789, "step": 10710 }, { "epoch": 0.5414004696851089, "grad_norm": 8.67428139976207, "learning_rate": 4.5868269522173955e-06, "loss": 0.3713, "step": 10720 }, { "epoch": 0.5419055074366809, "grad_norm": 3.39326548400569, "learning_rate": 4.5817759369633304e-06, "loss": 0.385, "step": 10730 }, { "epoch": 0.5424105451882528, "grad_norm": 8.005255858360202, "learning_rate": 4.576724921709264e-06, "loss": 0.3707, "step": 10740 }, { "epoch": 0.5429155829398248, "grad_norm": 5.061235443799371, "learning_rate": 4.571673906455198e-06, "loss": 0.3685, "step": 10750 }, { "epoch": 0.5434206206913966, "grad_norm": 3.7186572910401816, "learning_rate": 4.566622891201132e-06, "loss": 0.3793, "step": 10760 }, { "epoch": 0.5439256584429686, "grad_norm": 3.941979325788329, "learning_rate": 4.561571875947065e-06, "loss": 0.3723, "step": 10770 }, { "epoch": 0.5444306961945405, "grad_norm": 6.68200128954829, "learning_rate": 4.556520860692999e-06, "loss": 0.3699, "step": 10780 }, { "epoch": 0.5449357339461125, "grad_norm": 3.8742413460943204, "learning_rate": 4.5514698454389335e-06, "loss": 0.3712, "step": 10790 }, { "epoch": 0.5454407716976845, "grad_norm": 3.5527678739232327, "learning_rate": 4.546418830184868e-06, "loss": 0.3797, "step": 10800 }, { "epoch": 0.5459458094492563, "grad_norm": 3.293145624427697, "learning_rate": 4.541367814930802e-06, "loss": 0.392, "step": 10810 }, { "epoch": 0.5464508472008283, "grad_norm": 43.93745381251672, "learning_rate": 4.536316799676736e-06, "loss": 0.3848, "step": 10820 }, { "epoch": 0.5469558849524002, "grad_norm": 3.1532560256164466, "learning_rate": 4.531265784422669e-06, "loss": 0.3833, "step": 10830 }, { "epoch": 0.5474609227039722, "grad_norm": 6.167488921508967, "learning_rate": 4.526214769168603e-06, "loss": 0.4006, "step": 10840 }, { "epoch": 0.547965960455544, "grad_norm": 12.229490845691188, "learning_rate": 4.521163753914537e-06, "loss": 0.3803, "step": 10850 }, { "epoch": 0.548470998207116, "grad_norm": 4.1950075492065615, "learning_rate": 4.5161127386604715e-06, "loss": 0.3842, "step": 10860 }, { "epoch": 0.5489760359586879, "grad_norm": 3.654034176301022, "learning_rate": 4.511061723406406e-06, "loss": 0.396, "step": 10870 }, { "epoch": 0.5494810737102599, "grad_norm": 12.04048410661582, "learning_rate": 4.506010708152339e-06, "loss": 0.3902, "step": 10880 }, { "epoch": 0.5499861114618317, "grad_norm": 5.763873510396362, "learning_rate": 4.500959692898273e-06, "loss": 0.3803, "step": 10890 }, { "epoch": 0.5504911492134037, "grad_norm": 5.97333873812127, "learning_rate": 4.495908677644207e-06, "loss": 0.3787, "step": 10900 }, { "epoch": 0.5509961869649757, "grad_norm": 4.51021628683305, "learning_rate": 4.49085766239014e-06, "loss": 0.3835, "step": 10910 }, { "epoch": 0.5515012247165476, "grad_norm": 3.902274506866627, "learning_rate": 4.4858066471360745e-06, "loss": 0.3762, "step": 10920 }, { "epoch": 0.5520062624681195, "grad_norm": 3.0980828622359335, "learning_rate": 4.480755631882009e-06, "loss": 0.3873, "step": 10930 }, { "epoch": 0.5525113002196914, "grad_norm": 2.5033456579398674, "learning_rate": 4.475704616627943e-06, "loss": 0.4027, "step": 10940 }, { "epoch": 0.5530163379712634, "grad_norm": 2.888693890103198, "learning_rate": 4.470653601373877e-06, "loss": 0.3828, "step": 10950 }, { "epoch": 0.5535213757228353, "grad_norm": 3.5149007219506205, "learning_rate": 4.46560258611981e-06, "loss": 0.3797, "step": 10960 }, { "epoch": 0.5540264134744072, "grad_norm": 2.3262448954191894, "learning_rate": 4.460551570865744e-06, "loss": 0.385, "step": 10970 }, { "epoch": 0.5545314512259791, "grad_norm": 2.021941174461837, "learning_rate": 4.4555005556116784e-06, "loss": 0.3902, "step": 10980 }, { "epoch": 0.5550364889775511, "grad_norm": 3.2136340880143694, "learning_rate": 4.450449540357612e-06, "loss": 0.3767, "step": 10990 }, { "epoch": 0.555541526729123, "grad_norm": 6.405644243339184, "learning_rate": 4.445398525103547e-06, "loss": 0.394, "step": 11000 }, { "epoch": 0.5560465644806949, "grad_norm": 3.698639548037572, "learning_rate": 4.44034750984948e-06, "loss": 0.3733, "step": 11010 }, { "epoch": 0.5565516022322669, "grad_norm": 1.8438864220839366, "learning_rate": 4.435296494595414e-06, "loss": 0.387, "step": 11020 }, { "epoch": 0.5570566399838388, "grad_norm": 2.9672016536550654, "learning_rate": 4.430245479341348e-06, "loss": 0.3767, "step": 11030 }, { "epoch": 0.5575616777354108, "grad_norm": 9.95951552056093, "learning_rate": 4.4251944640872815e-06, "loss": 0.3796, "step": 11040 }, { "epoch": 0.5580667154869826, "grad_norm": 3.844799275005631, "learning_rate": 4.420143448833216e-06, "loss": 0.3721, "step": 11050 }, { "epoch": 0.5585717532385546, "grad_norm": 3.606527812089754, "learning_rate": 4.41509243357915e-06, "loss": 0.3726, "step": 11060 }, { "epoch": 0.5590767909901265, "grad_norm": 3.99898244357326, "learning_rate": 4.410041418325084e-06, "loss": 0.3822, "step": 11070 }, { "epoch": 0.5595818287416985, "grad_norm": 2.365878798228634, "learning_rate": 4.404990403071018e-06, "loss": 0.3858, "step": 11080 }, { "epoch": 0.5600868664932703, "grad_norm": 2.0114937465222105, "learning_rate": 4.399939387816951e-06, "loss": 0.3619, "step": 11090 }, { "epoch": 0.5605919042448423, "grad_norm": 4.267506041644642, "learning_rate": 4.394888372562885e-06, "loss": 0.3959, "step": 11100 }, { "epoch": 0.5610969419964142, "grad_norm": 2.3885028725522663, "learning_rate": 4.3898373573088195e-06, "loss": 0.3689, "step": 11110 }, { "epoch": 0.5616019797479862, "grad_norm": 8.297216447663791, "learning_rate": 4.384786342054753e-06, "loss": 0.3775, "step": 11120 }, { "epoch": 0.562107017499558, "grad_norm": 2.8469924568130516, "learning_rate": 4.379735326800687e-06, "loss": 0.3898, "step": 11130 }, { "epoch": 0.56261205525113, "grad_norm": 6.231958717493552, "learning_rate": 4.374684311546621e-06, "loss": 0.3837, "step": 11140 }, { "epoch": 0.563117093002702, "grad_norm": 4.309341792972094, "learning_rate": 4.369633296292555e-06, "loss": 0.3828, "step": 11150 }, { "epoch": 0.5636221307542739, "grad_norm": 3.203446767568462, "learning_rate": 4.364582281038489e-06, "loss": 0.3903, "step": 11160 }, { "epoch": 0.5641271685058458, "grad_norm": 4.843549458393687, "learning_rate": 4.359531265784423e-06, "loss": 0.3855, "step": 11170 }, { "epoch": 0.5646322062574177, "grad_norm": 2.525777084557924, "learning_rate": 4.354480250530357e-06, "loss": 0.3898, "step": 11180 }, { "epoch": 0.5651372440089897, "grad_norm": 1.9829345093280084, "learning_rate": 4.349429235276291e-06, "loss": 0.3981, "step": 11190 }, { "epoch": 0.5656422817605616, "grad_norm": 2.2195349808444162, "learning_rate": 4.344378220022225e-06, "loss": 0.3627, "step": 11200 }, { "epoch": 0.5661473195121335, "grad_norm": 1.8120887020230037, "learning_rate": 4.339327204768159e-06, "loss": 0.3785, "step": 11210 }, { "epoch": 0.5666523572637054, "grad_norm": 3.909050155051535, "learning_rate": 4.334276189514093e-06, "loss": 0.3893, "step": 11220 }, { "epoch": 0.5671573950152774, "grad_norm": 110.40544319977835, "learning_rate": 4.3292251742600264e-06, "loss": 0.3835, "step": 11230 }, { "epoch": 0.5676624327668494, "grad_norm": 2.734463901696715, "learning_rate": 4.3241741590059606e-06, "loss": 0.3996, "step": 11240 }, { "epoch": 0.5681674705184212, "grad_norm": 2.044621346060538, "learning_rate": 4.319123143751895e-06, "loss": 0.3963, "step": 11250 }, { "epoch": 0.5686725082699932, "grad_norm": 2.6336975727846346, "learning_rate": 4.314072128497828e-06, "loss": 0.3862, "step": 11260 }, { "epoch": 0.5691775460215651, "grad_norm": 2.817059832465792, "learning_rate": 4.309021113243763e-06, "loss": 0.3944, "step": 11270 }, { "epoch": 0.5696825837731371, "grad_norm": 3.118909901449835, "learning_rate": 4.303970097989696e-06, "loss": 0.3746, "step": 11280 }, { "epoch": 0.5701876215247089, "grad_norm": 2.419014264509884, "learning_rate": 4.29891908273563e-06, "loss": 0.3756, "step": 11290 }, { "epoch": 0.5706926592762809, "grad_norm": 2.009124011568514, "learning_rate": 4.2938680674815645e-06, "loss": 0.381, "step": 11300 }, { "epoch": 0.5711976970278528, "grad_norm": 1.7912588198575379, "learning_rate": 4.288817052227498e-06, "loss": 0.3854, "step": 11310 }, { "epoch": 0.5717027347794248, "grad_norm": 2.188948676804761, "learning_rate": 4.283766036973432e-06, "loss": 0.4011, "step": 11320 }, { "epoch": 0.5722077725309966, "grad_norm": 5.787714109914013, "learning_rate": 4.278715021719366e-06, "loss": 0.3953, "step": 11330 }, { "epoch": 0.5727128102825686, "grad_norm": 2.895115220305218, "learning_rate": 4.2736640064653e-06, "loss": 0.3835, "step": 11340 }, { "epoch": 0.5732178480341406, "grad_norm": 3.0581001395729013, "learning_rate": 4.268612991211234e-06, "loss": 0.3765, "step": 11350 }, { "epoch": 0.5737228857857125, "grad_norm": 2.8763642234703015, "learning_rate": 4.2635619759571675e-06, "loss": 0.3844, "step": 11360 }, { "epoch": 0.5742279235372844, "grad_norm": 2.73350988345317, "learning_rate": 4.258510960703102e-06, "loss": 0.3838, "step": 11370 }, { "epoch": 0.5747329612888563, "grad_norm": 2.187379524359724, "learning_rate": 4.253459945449036e-06, "loss": 0.3819, "step": 11380 }, { "epoch": 0.5752379990404283, "grad_norm": 2.5046900898952518, "learning_rate": 4.248408930194969e-06, "loss": 0.3959, "step": 11390 }, { "epoch": 0.5757430367920002, "grad_norm": 2.058073939600919, "learning_rate": 4.243357914940903e-06, "loss": 0.3931, "step": 11400 }, { "epoch": 0.5762480745435722, "grad_norm": 2.15231990076239, "learning_rate": 4.238306899686837e-06, "loss": 0.3755, "step": 11410 }, { "epoch": 0.576753112295144, "grad_norm": 2.050683275677417, "learning_rate": 4.233255884432771e-06, "loss": 0.3736, "step": 11420 }, { "epoch": 0.577258150046716, "grad_norm": 2.6748692786354624, "learning_rate": 4.2282048691787055e-06, "loss": 0.3766, "step": 11430 }, { "epoch": 0.577763187798288, "grad_norm": 3.596459258619974, "learning_rate": 4.223153853924639e-06, "loss": 0.3798, "step": 11440 }, { "epoch": 0.5782682255498599, "grad_norm": 3.0261933816595987, "learning_rate": 4.218102838670573e-06, "loss": 0.3958, "step": 11450 }, { "epoch": 0.5787732633014318, "grad_norm": 5.146945239564686, "learning_rate": 4.213051823416507e-06, "loss": 0.3725, "step": 11460 }, { "epoch": 0.5792783010530037, "grad_norm": 2.4698008594522896, "learning_rate": 4.20800080816244e-06, "loss": 0.376, "step": 11470 }, { "epoch": 0.5797833388045757, "grad_norm": 2.107316320060359, "learning_rate": 4.202949792908375e-06, "loss": 0.3629, "step": 11480 }, { "epoch": 0.5802883765561476, "grad_norm": 2.0980153420904575, "learning_rate": 4.197898777654309e-06, "loss": 0.4014, "step": 11490 }, { "epoch": 0.5807934143077195, "grad_norm": 3.2058729016983136, "learning_rate": 4.192847762400243e-06, "loss": 0.3828, "step": 11500 }, { "epoch": 0.5812984520592914, "grad_norm": 2.71859111643498, "learning_rate": 4.187796747146177e-06, "loss": 0.3819, "step": 11510 }, { "epoch": 0.5818034898108634, "grad_norm": 2.4534281139850793, "learning_rate": 4.182745731892111e-06, "loss": 0.3928, "step": 11520 }, { "epoch": 0.5823085275624353, "grad_norm": 1.8301553116699396, "learning_rate": 4.177694716638044e-06, "loss": 0.3654, "step": 11530 }, { "epoch": 0.5828135653140072, "grad_norm": 4.477875878428584, "learning_rate": 4.172643701383979e-06, "loss": 0.3748, "step": 11540 }, { "epoch": 0.5833186030655791, "grad_norm": 13.967741045162365, "learning_rate": 4.1675926861299125e-06, "loss": 0.3756, "step": 11550 }, { "epoch": 0.5838236408171511, "grad_norm": 3.872741784787074, "learning_rate": 4.162541670875847e-06, "loss": 0.3771, "step": 11560 }, { "epoch": 0.5843286785687231, "grad_norm": 4.469317092124396, "learning_rate": 4.157490655621781e-06, "loss": 0.374, "step": 11570 }, { "epoch": 0.5848337163202949, "grad_norm": 4.299796050846851, "learning_rate": 4.152439640367714e-06, "loss": 0.3721, "step": 11580 }, { "epoch": 0.5853387540718669, "grad_norm": 2.72784485464612, "learning_rate": 4.147388625113648e-06, "loss": 0.3752, "step": 11590 }, { "epoch": 0.5858437918234388, "grad_norm": 2.4059258940294574, "learning_rate": 4.142337609859582e-06, "loss": 0.3725, "step": 11600 }, { "epoch": 0.5863488295750108, "grad_norm": 3.311734782523416, "learning_rate": 4.137286594605516e-06, "loss": 0.3798, "step": 11610 }, { "epoch": 0.5868538673265826, "grad_norm": 3.792159108792209, "learning_rate": 4.1322355793514505e-06, "loss": 0.3611, "step": 11620 }, { "epoch": 0.5873589050781546, "grad_norm": 3.7603434596067684, "learning_rate": 4.127184564097384e-06, "loss": 0.3807, "step": 11630 }, { "epoch": 0.5878639428297265, "grad_norm": 2.082805585556261, "learning_rate": 4.122133548843318e-06, "loss": 0.3713, "step": 11640 }, { "epoch": 0.5883689805812985, "grad_norm": 3.7335861923532847, "learning_rate": 4.117082533589252e-06, "loss": 0.37, "step": 11650 }, { "epoch": 0.5888740183328703, "grad_norm": 2.56465711212247, "learning_rate": 4.112031518335185e-06, "loss": 0.3902, "step": 11660 }, { "epoch": 0.5893790560844423, "grad_norm": 2.1557448667197843, "learning_rate": 4.106980503081119e-06, "loss": 0.3943, "step": 11670 }, { "epoch": 0.5898840938360143, "grad_norm": 4.763988402275346, "learning_rate": 4.1019294878270535e-06, "loss": 0.3775, "step": 11680 }, { "epoch": 0.5903891315875862, "grad_norm": 5.278109696777853, "learning_rate": 4.096878472572988e-06, "loss": 0.3893, "step": 11690 }, { "epoch": 0.5908941693391581, "grad_norm": 2.6015113581109466, "learning_rate": 4.091827457318922e-06, "loss": 0.3746, "step": 11700 }, { "epoch": 0.59139920709073, "grad_norm": 2.851515770660457, "learning_rate": 4.086776442064855e-06, "loss": 0.3819, "step": 11710 }, { "epoch": 0.591904244842302, "grad_norm": 2.5969899871366215, "learning_rate": 4.081725426810789e-06, "loss": 0.3728, "step": 11720 }, { "epoch": 0.5924092825938739, "grad_norm": 1.9757415738509565, "learning_rate": 4.076674411556723e-06, "loss": 0.3724, "step": 11730 }, { "epoch": 0.5929143203454458, "grad_norm": 3.277581750970118, "learning_rate": 4.071623396302657e-06, "loss": 0.3688, "step": 11740 }, { "epoch": 0.5934193580970177, "grad_norm": 4.8469992620214155, "learning_rate": 4.0665723810485916e-06, "loss": 0.3562, "step": 11750 }, { "epoch": 0.5939243958485897, "grad_norm": 2.323267430569943, "learning_rate": 4.061521365794525e-06, "loss": 0.3907, "step": 11760 }, { "epoch": 0.5944294336001616, "grad_norm": 1.9799231963200372, "learning_rate": 4.056470350540459e-06, "loss": 0.3894, "step": 11770 }, { "epoch": 0.5949344713517335, "grad_norm": 2.576381259976751, "learning_rate": 4.051419335286393e-06, "loss": 0.3709, "step": 11780 }, { "epoch": 0.5954395091033055, "grad_norm": 3.283926951217961, "learning_rate": 4.046368320032326e-06, "loss": 0.3938, "step": 11790 }, { "epoch": 0.5959445468548774, "grad_norm": 3.29273998482733, "learning_rate": 4.0413173047782605e-06, "loss": 0.3896, "step": 11800 }, { "epoch": 0.5964495846064494, "grad_norm": 2.332643126609973, "learning_rate": 4.036266289524195e-06, "loss": 0.3719, "step": 11810 }, { "epoch": 0.5969546223580212, "grad_norm": 3.2558173835297963, "learning_rate": 4.031215274270129e-06, "loss": 0.3741, "step": 11820 }, { "epoch": 0.5974596601095932, "grad_norm": 6.835841939224201, "learning_rate": 4.026164259016063e-06, "loss": 0.363, "step": 11830 }, { "epoch": 0.5979646978611651, "grad_norm": 2.1914586837180687, "learning_rate": 4.021113243761996e-06, "loss": 0.3618, "step": 11840 }, { "epoch": 0.5984697356127371, "grad_norm": 2.360393516962277, "learning_rate": 4.01606222850793e-06, "loss": 0.3726, "step": 11850 }, { "epoch": 0.5989747733643089, "grad_norm": 2.46922458613829, "learning_rate": 4.011011213253864e-06, "loss": 0.3649, "step": 11860 }, { "epoch": 0.5994798111158809, "grad_norm": 1.886510326818929, "learning_rate": 4.005960197999798e-06, "loss": 0.371, "step": 11870 }, { "epoch": 0.5999848488674528, "grad_norm": 4.097224486709572, "learning_rate": 4.000909182745733e-06, "loss": 0.3944, "step": 11880 }, { "epoch": 0.6004898866190248, "grad_norm": 7.100119835211903, "learning_rate": 3.995858167491667e-06, "loss": 0.3665, "step": 11890 }, { "epoch": 0.6009949243705967, "grad_norm": 1.7863799930508097, "learning_rate": 3.9908071522376e-06, "loss": 0.3748, "step": 11900 }, { "epoch": 0.6014999621221686, "grad_norm": 3.0047977958425487, "learning_rate": 3.985756136983534e-06, "loss": 0.3869, "step": 11910 }, { "epoch": 0.6020049998737406, "grad_norm": 3.077659231433922, "learning_rate": 3.980705121729468e-06, "loss": 0.3618, "step": 11920 }, { "epoch": 0.6025100376253125, "grad_norm": 5.808559896098692, "learning_rate": 3.9756541064754015e-06, "loss": 0.3735, "step": 11930 }, { "epoch": 0.6030150753768844, "grad_norm": 2.1197853890011285, "learning_rate": 3.970603091221336e-06, "loss": 0.3901, "step": 11940 }, { "epoch": 0.6035201131284563, "grad_norm": 2.0357339213647054, "learning_rate": 3.96555207596727e-06, "loss": 0.3768, "step": 11950 }, { "epoch": 0.6040251508800283, "grad_norm": 1.9386614623259042, "learning_rate": 3.960501060713204e-06, "loss": 0.371, "step": 11960 }, { "epoch": 0.6045301886316002, "grad_norm": 3.573185030358563, "learning_rate": 3.955450045459138e-06, "loss": 0.3748, "step": 11970 }, { "epoch": 0.6050352263831721, "grad_norm": 2.6490276092763225, "learning_rate": 3.950399030205071e-06, "loss": 0.3907, "step": 11980 }, { "epoch": 0.605540264134744, "grad_norm": 1.9765526236319297, "learning_rate": 3.9453480149510054e-06, "loss": 0.3851, "step": 11990 }, { "epoch": 0.606045301886316, "grad_norm": 2.6148181435511866, "learning_rate": 3.9402969996969396e-06, "loss": 0.3829, "step": 12000 }, { "epoch": 0.606550339637888, "grad_norm": 2.0137917601843194, "learning_rate": 3.935245984442873e-06, "loss": 0.3876, "step": 12010 }, { "epoch": 0.6070553773894599, "grad_norm": 2.183286213214996, "learning_rate": 3.930194969188808e-06, "loss": 0.3852, "step": 12020 }, { "epoch": 0.6075604151410318, "grad_norm": 1.8996262451525328, "learning_rate": 3.925143953934741e-06, "loss": 0.3737, "step": 12030 }, { "epoch": 0.6080654528926037, "grad_norm": 2.1458215777612293, "learning_rate": 3.920092938680675e-06, "loss": 0.3705, "step": 12040 }, { "epoch": 0.6085704906441757, "grad_norm": 1.611205289782356, "learning_rate": 3.915041923426609e-06, "loss": 0.3774, "step": 12050 }, { "epoch": 0.6090755283957476, "grad_norm": 1.8784320811926083, "learning_rate": 3.909990908172543e-06, "loss": 0.3705, "step": 12060 }, { "epoch": 0.6095805661473195, "grad_norm": 2.0214114750705283, "learning_rate": 3.904939892918477e-06, "loss": 0.3579, "step": 12070 }, { "epoch": 0.6100856038988914, "grad_norm": 2.52455375915045, "learning_rate": 3.899888877664411e-06, "loss": 0.3757, "step": 12080 }, { "epoch": 0.6105906416504634, "grad_norm": 2.7488427549157963, "learning_rate": 3.894837862410345e-06, "loss": 0.3753, "step": 12090 }, { "epoch": 0.6110956794020354, "grad_norm": 2.366101694404488, "learning_rate": 3.889786847156279e-06, "loss": 0.3583, "step": 12100 }, { "epoch": 0.6116007171536072, "grad_norm": 4.526955084775627, "learning_rate": 3.884735831902212e-06, "loss": 0.373, "step": 12110 }, { "epoch": 0.6121057549051792, "grad_norm": 1.6632942025151354, "learning_rate": 3.8796848166481465e-06, "loss": 0.3797, "step": 12120 }, { "epoch": 0.6126107926567511, "grad_norm": 2.122544662038796, "learning_rate": 3.874633801394081e-06, "loss": 0.3763, "step": 12130 }, { "epoch": 0.6131158304083231, "grad_norm": 2.136310843503549, "learning_rate": 3.869582786140014e-06, "loss": 0.3822, "step": 12140 }, { "epoch": 0.6136208681598949, "grad_norm": 1.701221883208955, "learning_rate": 3.864531770885949e-06, "loss": 0.3636, "step": 12150 }, { "epoch": 0.6141259059114669, "grad_norm": 1.9524548661829169, "learning_rate": 3.859480755631882e-06, "loss": 0.3913, "step": 12160 }, { "epoch": 0.6146309436630388, "grad_norm": 2.58084193291891, "learning_rate": 3.854429740377816e-06, "loss": 0.3647, "step": 12170 }, { "epoch": 0.6151359814146108, "grad_norm": 2.6683083927745552, "learning_rate": 3.84937872512375e-06, "loss": 0.3834, "step": 12180 }, { "epoch": 0.6156410191661826, "grad_norm": 5.415821035201854, "learning_rate": 3.844327709869684e-06, "loss": 0.3679, "step": 12190 }, { "epoch": 0.6161460569177546, "grad_norm": 1.7693494201780369, "learning_rate": 3.839276694615618e-06, "loss": 0.3675, "step": 12200 }, { "epoch": 0.6166510946693265, "grad_norm": 2.025826626656601, "learning_rate": 3.834225679361552e-06, "loss": 0.3807, "step": 12210 }, { "epoch": 0.6171561324208985, "grad_norm": 2.409768770452221, "learning_rate": 3.829174664107486e-06, "loss": 0.3718, "step": 12220 }, { "epoch": 0.6176611701724704, "grad_norm": 2.81111834040087, "learning_rate": 3.82412364885342e-06, "loss": 0.3809, "step": 12230 }, { "epoch": 0.6181662079240423, "grad_norm": 2.261752998915838, "learning_rate": 3.8190726335993535e-06, "loss": 0.392, "step": 12240 }, { "epoch": 0.6186712456756143, "grad_norm": 4.1776939292020066, "learning_rate": 3.8140216183452876e-06, "loss": 0.3738, "step": 12250 }, { "epoch": 0.6191762834271862, "grad_norm": 2.1437198828433197, "learning_rate": 3.8089706030912217e-06, "loss": 0.3719, "step": 12260 }, { "epoch": 0.6196813211787581, "grad_norm": 3.075348229616783, "learning_rate": 3.8039195878371554e-06, "loss": 0.3706, "step": 12270 }, { "epoch": 0.62018635893033, "grad_norm": 2.0146213511322566, "learning_rate": 3.798868572583089e-06, "loss": 0.3897, "step": 12280 }, { "epoch": 0.620691396681902, "grad_norm": 3.0389012573946625, "learning_rate": 3.7938175573290236e-06, "loss": 0.3744, "step": 12290 }, { "epoch": 0.6211964344334739, "grad_norm": 3.500847066826284, "learning_rate": 3.7887665420749573e-06, "loss": 0.3645, "step": 12300 }, { "epoch": 0.6217014721850458, "grad_norm": 4.562441605553007, "learning_rate": 3.7837155268208915e-06, "loss": 0.3738, "step": 12310 }, { "epoch": 0.6222065099366177, "grad_norm": 2.8281857847564957, "learning_rate": 3.778664511566825e-06, "loss": 0.3645, "step": 12320 }, { "epoch": 0.6227115476881897, "grad_norm": 4.569461603791169, "learning_rate": 3.7736134963127593e-06, "loss": 0.3525, "step": 12330 }, { "epoch": 0.6232165854397617, "grad_norm": 3.3932139149292198, "learning_rate": 3.768562481058693e-06, "loss": 0.3686, "step": 12340 }, { "epoch": 0.6237216231913335, "grad_norm": 3.208108830968236, "learning_rate": 3.7635114658046267e-06, "loss": 0.3675, "step": 12350 }, { "epoch": 0.6242266609429055, "grad_norm": 3.325030777041727, "learning_rate": 3.7584604505505612e-06, "loss": 0.3827, "step": 12360 }, { "epoch": 0.6247316986944774, "grad_norm": 3.001621471243721, "learning_rate": 3.753409435296495e-06, "loss": 0.3673, "step": 12370 }, { "epoch": 0.6252367364460494, "grad_norm": 2.7314438383701263, "learning_rate": 3.748358420042429e-06, "loss": 0.3831, "step": 12380 }, { "epoch": 0.6257417741976212, "grad_norm": 4.37429312079951, "learning_rate": 3.7433074047883628e-06, "loss": 0.3665, "step": 12390 }, { "epoch": 0.6262468119491932, "grad_norm": 3.9619988373008077, "learning_rate": 3.7382563895342965e-06, "loss": 0.3747, "step": 12400 }, { "epoch": 0.6267518497007651, "grad_norm": 4.80089629937948, "learning_rate": 3.7332053742802306e-06, "loss": 0.3777, "step": 12410 }, { "epoch": 0.6272568874523371, "grad_norm": 3.8675160835398485, "learning_rate": 3.7281543590261643e-06, "loss": 0.3644, "step": 12420 }, { "epoch": 0.6277619252039089, "grad_norm": 2.235819960025121, "learning_rate": 3.723103343772099e-06, "loss": 0.37, "step": 12430 }, { "epoch": 0.6282669629554809, "grad_norm": 2.1802012458708164, "learning_rate": 3.7180523285180325e-06, "loss": 0.3783, "step": 12440 }, { "epoch": 0.6287720007070529, "grad_norm": 5.254753451223298, "learning_rate": 3.7130013132639662e-06, "loss": 0.3778, "step": 12450 }, { "epoch": 0.6292770384586248, "grad_norm": 3.0569298328433794, "learning_rate": 3.7079502980099004e-06, "loss": 0.3708, "step": 12460 }, { "epoch": 0.6297820762101967, "grad_norm": 3.453756701658818, "learning_rate": 3.702899282755834e-06, "loss": 0.3614, "step": 12470 }, { "epoch": 0.6302871139617686, "grad_norm": 3.0406418341037105, "learning_rate": 3.6978482675017678e-06, "loss": 0.3605, "step": 12480 }, { "epoch": 0.6307921517133406, "grad_norm": 5.948695461444471, "learning_rate": 3.6927972522477023e-06, "loss": 0.3769, "step": 12490 }, { "epoch": 0.6312971894649125, "grad_norm": 3.6565401647116986, "learning_rate": 3.687746236993636e-06, "loss": 0.3598, "step": 12500 }, { "epoch": 0.6318022272164844, "grad_norm": 2.386153355802283, "learning_rate": 3.68269522173957e-06, "loss": 0.3668, "step": 12510 }, { "epoch": 0.6323072649680563, "grad_norm": 5.317495105141212, "learning_rate": 3.677644206485504e-06, "loss": 0.3678, "step": 12520 }, { "epoch": 0.6328123027196283, "grad_norm": 6.576082498743726, "learning_rate": 3.672593191231438e-06, "loss": 0.3668, "step": 12530 }, { "epoch": 0.6333173404712003, "grad_norm": 6.268613985781669, "learning_rate": 3.6675421759773717e-06, "loss": 0.3635, "step": 12540 }, { "epoch": 0.6338223782227721, "grad_norm": 5.931534215833603, "learning_rate": 3.6624911607233054e-06, "loss": 0.3836, "step": 12550 }, { "epoch": 0.634327415974344, "grad_norm": 3.414474702810524, "learning_rate": 3.65744014546924e-06, "loss": 0.3638, "step": 12560 }, { "epoch": 0.634832453725916, "grad_norm": 2.5786921065802906, "learning_rate": 3.6523891302151736e-06, "loss": 0.3792, "step": 12570 }, { "epoch": 0.635337491477488, "grad_norm": 3.3296465725491973, "learning_rate": 3.6473381149611077e-06, "loss": 0.3726, "step": 12580 }, { "epoch": 0.6358425292290599, "grad_norm": 5.112120958135022, "learning_rate": 3.6422870997070414e-06, "loss": 0.3608, "step": 12590 }, { "epoch": 0.6363475669806318, "grad_norm": 4.712824755681933, "learning_rate": 3.637236084452975e-06, "loss": 0.361, "step": 12600 }, { "epoch": 0.6368526047322037, "grad_norm": 2.557395610295011, "learning_rate": 3.6321850691989093e-06, "loss": 0.3646, "step": 12610 }, { "epoch": 0.6373576424837757, "grad_norm": 2.4265764249895962, "learning_rate": 3.627134053944843e-06, "loss": 0.3673, "step": 12620 }, { "epoch": 0.6378626802353476, "grad_norm": 9.111738338010397, "learning_rate": 3.6220830386907775e-06, "loss": 0.3748, "step": 12630 }, { "epoch": 0.6383677179869195, "grad_norm": 13.21927265302919, "learning_rate": 3.617032023436711e-06, "loss": 0.3736, "step": 12640 }, { "epoch": 0.6388727557384914, "grad_norm": 3.433344571738311, "learning_rate": 3.611981008182645e-06, "loss": 0.3804, "step": 12650 }, { "epoch": 0.6393777934900634, "grad_norm": 3.3546640582148393, "learning_rate": 3.606929992928579e-06, "loss": 0.3512, "step": 12660 }, { "epoch": 0.6398828312416354, "grad_norm": 2.665395597695038, "learning_rate": 3.6018789776745127e-06, "loss": 0.3777, "step": 12670 }, { "epoch": 0.6403878689932072, "grad_norm": 3.7554388445793343, "learning_rate": 3.5968279624204464e-06, "loss": 0.3745, "step": 12680 }, { "epoch": 0.6408929067447792, "grad_norm": 5.3470856792032695, "learning_rate": 3.5917769471663805e-06, "loss": 0.3712, "step": 12690 }, { "epoch": 0.6413979444963511, "grad_norm": 3.090410688880816, "learning_rate": 3.586725931912315e-06, "loss": 0.3621, "step": 12700 }, { "epoch": 0.6419029822479231, "grad_norm": 2.664925787957406, "learning_rate": 3.581674916658249e-06, "loss": 0.3543, "step": 12710 }, { "epoch": 0.6424080199994949, "grad_norm": 2.449259146215423, "learning_rate": 3.5766239014041825e-06, "loss": 0.3637, "step": 12720 }, { "epoch": 0.6429130577510669, "grad_norm": 3.159318328572562, "learning_rate": 3.5715728861501166e-06, "loss": 0.3793, "step": 12730 }, { "epoch": 0.6434180955026388, "grad_norm": 2.7133150428468835, "learning_rate": 3.5665218708960503e-06, "loss": 0.3798, "step": 12740 }, { "epoch": 0.6439231332542108, "grad_norm": 3.374200452364885, "learning_rate": 3.561470855641984e-06, "loss": 0.3624, "step": 12750 }, { "epoch": 0.6444281710057826, "grad_norm": 2.9622292324304365, "learning_rate": 3.556419840387918e-06, "loss": 0.3553, "step": 12760 }, { "epoch": 0.6449332087573546, "grad_norm": 2.609047480998997, "learning_rate": 3.5513688251338523e-06, "loss": 0.3721, "step": 12770 }, { "epoch": 0.6454382465089266, "grad_norm": 5.058720798344808, "learning_rate": 3.5463178098797864e-06, "loss": 0.3486, "step": 12780 }, { "epoch": 0.6459432842604985, "grad_norm": 2.2596797012742558, "learning_rate": 3.54126679462572e-06, "loss": 0.3696, "step": 12790 }, { "epoch": 0.6464483220120704, "grad_norm": 2.5737345357996815, "learning_rate": 3.536215779371654e-06, "loss": 0.3534, "step": 12800 }, { "epoch": 0.6469533597636423, "grad_norm": 3.9591498502643305, "learning_rate": 3.531164764117588e-06, "loss": 0.3635, "step": 12810 }, { "epoch": 0.6474583975152143, "grad_norm": 4.5793714339332805, "learning_rate": 3.5261137488635216e-06, "loss": 0.3633, "step": 12820 }, { "epoch": 0.6479634352667862, "grad_norm": 2.4696926794812173, "learning_rate": 3.521062733609456e-06, "loss": 0.3681, "step": 12830 }, { "epoch": 0.6484684730183581, "grad_norm": 10.025961417450299, "learning_rate": 3.51601171835539e-06, "loss": 0.3645, "step": 12840 }, { "epoch": 0.64897351076993, "grad_norm": 2.2863708800017934, "learning_rate": 3.5109607031013236e-06, "loss": 0.3686, "step": 12850 }, { "epoch": 0.649478548521502, "grad_norm": 1.9177686162549965, "learning_rate": 3.5059096878472577e-06, "loss": 0.3673, "step": 12860 }, { "epoch": 0.649983586273074, "grad_norm": 5.445065512861327, "learning_rate": 3.5008586725931914e-06, "loss": 0.3801, "step": 12870 }, { "epoch": 0.6504886240246458, "grad_norm": 2.6676713316438336, "learning_rate": 3.4958076573391255e-06, "loss": 0.3779, "step": 12880 }, { "epoch": 0.6509936617762178, "grad_norm": 3.8611942525569374, "learning_rate": 3.490756642085059e-06, "loss": 0.375, "step": 12890 }, { "epoch": 0.6514986995277897, "grad_norm": 3.774007520734188, "learning_rate": 3.4857056268309938e-06, "loss": 0.3768, "step": 12900 }, { "epoch": 0.6520037372793617, "grad_norm": 3.200552688232275, "learning_rate": 3.4806546115769275e-06, "loss": 0.3831, "step": 12910 }, { "epoch": 0.6525087750309335, "grad_norm": 1.9098229490724261, "learning_rate": 3.475603596322861e-06, "loss": 0.379, "step": 12920 }, { "epoch": 0.6530138127825055, "grad_norm": 3.943477029717605, "learning_rate": 3.4705525810687953e-06, "loss": 0.3712, "step": 12930 }, { "epoch": 0.6535188505340774, "grad_norm": 4.9375810935188165, "learning_rate": 3.465501565814729e-06, "loss": 0.3727, "step": 12940 }, { "epoch": 0.6540238882856494, "grad_norm": 5.033670107006043, "learning_rate": 3.4604505505606627e-06, "loss": 0.3663, "step": 12950 }, { "epoch": 0.6545289260372212, "grad_norm": 3.064561646751532, "learning_rate": 3.455399535306597e-06, "loss": 0.3777, "step": 12960 }, { "epoch": 0.6550339637887932, "grad_norm": 2.3278833688543594, "learning_rate": 3.450348520052531e-06, "loss": 0.3539, "step": 12970 }, { "epoch": 0.6555390015403652, "grad_norm": 2.194839952891626, "learning_rate": 3.445297504798465e-06, "loss": 0.3623, "step": 12980 }, { "epoch": 0.6560440392919371, "grad_norm": 4.478106964413342, "learning_rate": 3.4402464895443987e-06, "loss": 0.3717, "step": 12990 }, { "epoch": 0.656549077043509, "grad_norm": 3.862763827323303, "learning_rate": 3.4351954742903324e-06, "loss": 0.3497, "step": 13000 }, { "epoch": 0.6570541147950809, "grad_norm": 6.994733245951842, "learning_rate": 3.4301444590362666e-06, "loss": 0.3814, "step": 13010 }, { "epoch": 0.6575591525466529, "grad_norm": 5.798821632935566, "learning_rate": 3.4250934437822003e-06, "loss": 0.3632, "step": 13020 }, { "epoch": 0.6580641902982248, "grad_norm": 4.04056214450326, "learning_rate": 3.420042428528134e-06, "loss": 0.3848, "step": 13030 }, { "epoch": 0.6585692280497967, "grad_norm": 2.567673036911413, "learning_rate": 3.4149914132740685e-06, "loss": 0.3613, "step": 13040 }, { "epoch": 0.6590742658013686, "grad_norm": 4.569693836912058, "learning_rate": 3.4099403980200022e-06, "loss": 0.3689, "step": 13050 }, { "epoch": 0.6595793035529406, "grad_norm": 7.024167809375034, "learning_rate": 3.4048893827659363e-06, "loss": 0.3752, "step": 13060 }, { "epoch": 0.6600843413045125, "grad_norm": 15.307342736787076, "learning_rate": 3.39983836751187e-06, "loss": 0.3563, "step": 13070 }, { "epoch": 0.6605893790560844, "grad_norm": 5.154751774553765, "learning_rate": 3.394787352257804e-06, "loss": 0.374, "step": 13080 }, { "epoch": 0.6610944168076563, "grad_norm": 6.4670558523466894, "learning_rate": 3.389736337003738e-06, "loss": 0.3662, "step": 13090 }, { "epoch": 0.6615994545592283, "grad_norm": 4.631202509755095, "learning_rate": 3.3846853217496724e-06, "loss": 0.3719, "step": 13100 }, { "epoch": 0.6621044923108003, "grad_norm": 2.638649824542641, "learning_rate": 3.379634306495606e-06, "loss": 0.3649, "step": 13110 }, { "epoch": 0.6626095300623721, "grad_norm": 4.93421678581268, "learning_rate": 3.37458329124154e-06, "loss": 0.3583, "step": 13120 }, { "epoch": 0.6631145678139441, "grad_norm": 8.596147858750863, "learning_rate": 3.369532275987474e-06, "loss": 0.3757, "step": 13130 }, { "epoch": 0.663619605565516, "grad_norm": 2.389686125334587, "learning_rate": 3.3644812607334076e-06, "loss": 0.3665, "step": 13140 }, { "epoch": 0.664124643317088, "grad_norm": 7.517201643559354, "learning_rate": 3.3594302454793413e-06, "loss": 0.3772, "step": 13150 }, { "epoch": 0.6646296810686598, "grad_norm": 3.2962729985486843, "learning_rate": 3.3543792302252755e-06, "loss": 0.3662, "step": 13160 }, { "epoch": 0.6651347188202318, "grad_norm": 2.696587145771749, "learning_rate": 3.3493282149712096e-06, "loss": 0.3784, "step": 13170 }, { "epoch": 0.6656397565718037, "grad_norm": 9.007208510801052, "learning_rate": 3.3442771997171437e-06, "loss": 0.3754, "step": 13180 }, { "epoch": 0.6661447943233757, "grad_norm": 3.6378171433319384, "learning_rate": 3.3392261844630774e-06, "loss": 0.36, "step": 13190 }, { "epoch": 0.6666498320749477, "grad_norm": 5.542336432011706, "learning_rate": 3.334175169209011e-06, "loss": 0.3594, "step": 13200 }, { "epoch": 0.6671548698265195, "grad_norm": 3.865761953639028, "learning_rate": 3.3291241539549452e-06, "loss": 0.3627, "step": 13210 }, { "epoch": 0.6676599075780915, "grad_norm": 8.688331722475619, "learning_rate": 3.324073138700879e-06, "loss": 0.3583, "step": 13220 }, { "epoch": 0.6681649453296634, "grad_norm": 7.506416675568487, "learning_rate": 3.319022123446813e-06, "loss": 0.389, "step": 13230 }, { "epoch": 0.6686699830812354, "grad_norm": 5.431932592827668, "learning_rate": 3.313971108192747e-06, "loss": 0.3544, "step": 13240 }, { "epoch": 0.6691750208328072, "grad_norm": 5.709981208456748, "learning_rate": 3.3089200929386813e-06, "loss": 0.3872, "step": 13250 }, { "epoch": 0.6696800585843792, "grad_norm": 3.72615004588395, "learning_rate": 3.303869077684615e-06, "loss": 0.3694, "step": 13260 }, { "epoch": 0.6701850963359511, "grad_norm": 4.672631711696132, "learning_rate": 3.2988180624305487e-06, "loss": 0.3552, "step": 13270 }, { "epoch": 0.6706901340875231, "grad_norm": 4.166097239004073, "learning_rate": 3.293767047176483e-06, "loss": 0.3705, "step": 13280 }, { "epoch": 0.6711951718390949, "grad_norm": 9.195919275508842, "learning_rate": 3.2887160319224165e-06, "loss": 0.3688, "step": 13290 }, { "epoch": 0.6717002095906669, "grad_norm": 6.6636791033610185, "learning_rate": 3.2836650166683502e-06, "loss": 0.3621, "step": 13300 }, { "epoch": 0.6722052473422389, "grad_norm": 5.256124870002979, "learning_rate": 3.2786140014142848e-06, "loss": 0.3611, "step": 13310 }, { "epoch": 0.6727102850938108, "grad_norm": 7.691842196761616, "learning_rate": 3.2735629861602185e-06, "loss": 0.3634, "step": 13320 }, { "epoch": 0.6732153228453827, "grad_norm": 4.350658776477639, "learning_rate": 3.2685119709061526e-06, "loss": 0.3588, "step": 13330 }, { "epoch": 0.6737203605969546, "grad_norm": 11.623446284645299, "learning_rate": 3.2634609556520863e-06, "loss": 0.371, "step": 13340 }, { "epoch": 0.6742253983485266, "grad_norm": 3.656024443945701, "learning_rate": 3.25840994039802e-06, "loss": 0.3496, "step": 13350 }, { "epoch": 0.6747304361000985, "grad_norm": 4.747809479838541, "learning_rate": 3.253358925143954e-06, "loss": 0.369, "step": 13360 }, { "epoch": 0.6752354738516704, "grad_norm": 3.045504794009347, "learning_rate": 3.248307909889888e-06, "loss": 0.3675, "step": 13370 }, { "epoch": 0.6757405116032423, "grad_norm": 4.5225509793672405, "learning_rate": 3.2432568946358224e-06, "loss": 0.364, "step": 13380 }, { "epoch": 0.6762455493548143, "grad_norm": 6.376859379432204, "learning_rate": 3.238205879381756e-06, "loss": 0.3538, "step": 13390 }, { "epoch": 0.6767505871063862, "grad_norm": 4.459340178610681, "learning_rate": 3.2331548641276898e-06, "loss": 0.3549, "step": 13400 }, { "epoch": 0.6772556248579581, "grad_norm": 4.009290064517063, "learning_rate": 3.228103848873624e-06, "loss": 0.3635, "step": 13410 }, { "epoch": 0.67776066260953, "grad_norm": 4.352335627454303, "learning_rate": 3.2230528336195576e-06, "loss": 0.3642, "step": 13420 }, { "epoch": 0.678265700361102, "grad_norm": 8.538415021418777, "learning_rate": 3.2180018183654917e-06, "loss": 0.3638, "step": 13430 }, { "epoch": 0.678770738112674, "grad_norm": 9.929796560384965, "learning_rate": 3.212950803111426e-06, "loss": 0.3733, "step": 13440 }, { "epoch": 0.6792757758642458, "grad_norm": 10.24812538481589, "learning_rate": 3.20789978785736e-06, "loss": 0.3733, "step": 13450 }, { "epoch": 0.6797808136158178, "grad_norm": 4.911901994444236, "learning_rate": 3.2028487726032937e-06, "loss": 0.3546, "step": 13460 }, { "epoch": 0.6802858513673897, "grad_norm": 9.459161115684884, "learning_rate": 3.1977977573492274e-06, "loss": 0.3604, "step": 13470 }, { "epoch": 0.6807908891189617, "grad_norm": 58.44069759388177, "learning_rate": 3.1927467420951615e-06, "loss": 0.3682, "step": 13480 }, { "epoch": 0.6812959268705335, "grad_norm": 5.773542468847822, "learning_rate": 3.187695726841095e-06, "loss": 0.3654, "step": 13490 }, { "epoch": 0.6818009646221055, "grad_norm": 7.236039686643189, "learning_rate": 3.182644711587029e-06, "loss": 0.3581, "step": 13500 }, { "epoch": 0.6823060023736774, "grad_norm": 8.164527538084936, "learning_rate": 3.1775936963329634e-06, "loss": 0.3719, "step": 13510 }, { "epoch": 0.6828110401252494, "grad_norm": 8.750460822533052, "learning_rate": 3.172542681078897e-06, "loss": 0.3685, "step": 13520 }, { "epoch": 0.6833160778768212, "grad_norm": 8.31780664281577, "learning_rate": 3.1674916658248313e-06, "loss": 0.3738, "step": 13530 }, { "epoch": 0.6838211156283932, "grad_norm": 39.92814930883877, "learning_rate": 3.162440650570765e-06, "loss": 0.3561, "step": 13540 }, { "epoch": 0.6843261533799652, "grad_norm": 7.250183885140758, "learning_rate": 3.1573896353166987e-06, "loss": 0.3763, "step": 13550 }, { "epoch": 0.6848311911315371, "grad_norm": 7.545512249578723, "learning_rate": 3.1523386200626328e-06, "loss": 0.3619, "step": 13560 }, { "epoch": 0.685336228883109, "grad_norm": 13.136078405966021, "learning_rate": 3.1472876048085665e-06, "loss": 0.3506, "step": 13570 }, { "epoch": 0.6858412666346809, "grad_norm": 12.847273364738184, "learning_rate": 3.142236589554501e-06, "loss": 0.3718, "step": 13580 }, { "epoch": 0.6863463043862529, "grad_norm": 5.131472126489263, "learning_rate": 3.1371855743004347e-06, "loss": 0.3573, "step": 13590 }, { "epoch": 0.6868513421378248, "grad_norm": 8.846171149150827, "learning_rate": 3.132134559046369e-06, "loss": 0.3645, "step": 13600 }, { "epoch": 0.6873563798893967, "grad_norm": 7.08666632645302, "learning_rate": 3.1270835437923026e-06, "loss": 0.3863, "step": 13610 }, { "epoch": 0.6878614176409686, "grad_norm": 9.125152763348956, "learning_rate": 3.1220325285382363e-06, "loss": 0.3612, "step": 13620 }, { "epoch": 0.6883664553925406, "grad_norm": 22.795792171268904, "learning_rate": 3.1169815132841704e-06, "loss": 0.3514, "step": 13630 }, { "epoch": 0.6888714931441126, "grad_norm": 13.972423486929486, "learning_rate": 3.111930498030104e-06, "loss": 0.343, "step": 13640 }, { "epoch": 0.6893765308956844, "grad_norm": 7.146064054239514, "learning_rate": 3.1068794827760386e-06, "loss": 0.3634, "step": 13650 }, { "epoch": 0.6898815686472564, "grad_norm": 12.633883151844028, "learning_rate": 3.1018284675219723e-06, "loss": 0.3622, "step": 13660 }, { "epoch": 0.6903866063988283, "grad_norm": 10.567266022971053, "learning_rate": 3.096777452267906e-06, "loss": 0.3775, "step": 13670 }, { "epoch": 0.6908916441504003, "grad_norm": 10.665161639920084, "learning_rate": 3.09172643701384e-06, "loss": 0.3473, "step": 13680 }, { "epoch": 0.6913966819019721, "grad_norm": 13.421618433035983, "learning_rate": 3.086675421759774e-06, "loss": 0.3566, "step": 13690 }, { "epoch": 0.6919017196535441, "grad_norm": 19.50453127590542, "learning_rate": 3.0816244065057076e-06, "loss": 0.3524, "step": 13700 }, { "epoch": 0.692406757405116, "grad_norm": 11.56456458693372, "learning_rate": 3.0765733912516417e-06, "loss": 0.3489, "step": 13710 }, { "epoch": 0.692911795156688, "grad_norm": 9.379959657537876, "learning_rate": 3.071522375997576e-06, "loss": 0.3736, "step": 13720 }, { "epoch": 0.6934168329082598, "grad_norm": 22.486832278224945, "learning_rate": 3.06647136074351e-06, "loss": 0.3586, "step": 13730 }, { "epoch": 0.6939218706598318, "grad_norm": 10.367615306255258, "learning_rate": 3.0614203454894436e-06, "loss": 0.3598, "step": 13740 }, { "epoch": 0.6944269084114038, "grad_norm": 13.659141357148844, "learning_rate": 3.0563693302353773e-06, "loss": 0.3559, "step": 13750 }, { "epoch": 0.6949319461629757, "grad_norm": 28.090360883092867, "learning_rate": 3.0513183149813114e-06, "loss": 0.3572, "step": 13760 }, { "epoch": 0.6954369839145476, "grad_norm": 18.453465716800892, "learning_rate": 3.046267299727245e-06, "loss": 0.3724, "step": 13770 }, { "epoch": 0.6959420216661195, "grad_norm": 29.69383075969026, "learning_rate": 3.0412162844731797e-06, "loss": 0.3715, "step": 13780 }, { "epoch": 0.6964470594176915, "grad_norm": 5.487134314962381, "learning_rate": 3.0361652692191134e-06, "loss": 0.3498, "step": 13790 }, { "epoch": 0.6969520971692634, "grad_norm": 5.672011269458868, "learning_rate": 3.0311142539650475e-06, "loss": 0.3681, "step": 13800 }, { "epoch": 0.6974571349208354, "grad_norm": 4.839559671681653, "learning_rate": 3.0260632387109812e-06, "loss": 0.3623, "step": 13810 }, { "epoch": 0.6979621726724072, "grad_norm": 4.235787248204053, "learning_rate": 3.021012223456915e-06, "loss": 0.3607, "step": 13820 }, { "epoch": 0.6984672104239792, "grad_norm": 4.608668652433986, "learning_rate": 3.015961208202849e-06, "loss": 0.3477, "step": 13830 }, { "epoch": 0.6989722481755511, "grad_norm": 4.704716321355951, "learning_rate": 3.0109101929487827e-06, "loss": 0.3685, "step": 13840 }, { "epoch": 0.6994772859271231, "grad_norm": 6.503590629238218, "learning_rate": 3.0058591776947173e-06, "loss": 0.3721, "step": 13850 }, { "epoch": 0.699982323678695, "grad_norm": 4.428750198778166, "learning_rate": 3.000808162440651e-06, "loss": 0.3712, "step": 13860 }, { "epoch": 0.7004873614302669, "grad_norm": 8.223361489268525, "learning_rate": 2.9957571471865847e-06, "loss": 0.3615, "step": 13870 }, { "epoch": 0.7009923991818389, "grad_norm": 3.8707110375759908, "learning_rate": 2.990706131932519e-06, "loss": 0.368, "step": 13880 }, { "epoch": 0.7014974369334108, "grad_norm": 3.3033672047102853, "learning_rate": 2.9856551166784525e-06, "loss": 0.3662, "step": 13890 }, { "epoch": 0.7020024746849827, "grad_norm": 5.8378570543101045, "learning_rate": 2.9806041014243862e-06, "loss": 0.3766, "step": 13900 }, { "epoch": 0.7025075124365546, "grad_norm": 3.6651748652891425, "learning_rate": 2.9755530861703203e-06, "loss": 0.3661, "step": 13910 }, { "epoch": 0.7030125501881266, "grad_norm": 5.784855966194933, "learning_rate": 2.9705020709162545e-06, "loss": 0.367, "step": 13920 }, { "epoch": 0.7035175879396985, "grad_norm": 7.312016198568584, "learning_rate": 2.9654510556621886e-06, "loss": 0.3672, "step": 13930 }, { "epoch": 0.7040226256912704, "grad_norm": 4.875104979534643, "learning_rate": 2.9604000404081223e-06, "loss": 0.3557, "step": 13940 }, { "epoch": 0.7045276634428423, "grad_norm": 64.55120292569038, "learning_rate": 2.955349025154056e-06, "loss": 0.357, "step": 13950 }, { "epoch": 0.7050327011944143, "grad_norm": 4.765695435475721, "learning_rate": 2.95029800989999e-06, "loss": 0.3625, "step": 13960 }, { "epoch": 0.7055377389459863, "grad_norm": 9.706426400727505, "learning_rate": 2.945246994645924e-06, "loss": 0.3692, "step": 13970 }, { "epoch": 0.7060427766975581, "grad_norm": 5.7703156609987785, "learning_rate": 2.940195979391858e-06, "loss": 0.3496, "step": 13980 }, { "epoch": 0.7065478144491301, "grad_norm": 10.413216240197633, "learning_rate": 2.935144964137792e-06, "loss": 0.3435, "step": 13990 }, { "epoch": 0.707052852200702, "grad_norm": 7.078239510812093, "learning_rate": 2.930093948883726e-06, "loss": 0.3661, "step": 14000 }, { "epoch": 0.707557889952274, "grad_norm": 10.541563423283328, "learning_rate": 2.92504293362966e-06, "loss": 0.3569, "step": 14010 }, { "epoch": 0.7080629277038458, "grad_norm": 4.886876661534467, "learning_rate": 2.9199919183755936e-06, "loss": 0.368, "step": 14020 }, { "epoch": 0.7085679654554178, "grad_norm": 15.262848195897316, "learning_rate": 2.9149409031215277e-06, "loss": 0.3682, "step": 14030 }, { "epoch": 0.7090730032069897, "grad_norm": 12.794932541476035, "learning_rate": 2.9098898878674614e-06, "loss": 0.3592, "step": 14040 }, { "epoch": 0.7095780409585617, "grad_norm": 10.61866026419293, "learning_rate": 2.904838872613395e-06, "loss": 0.3604, "step": 14050 }, { "epoch": 0.7100830787101335, "grad_norm": 25.4476962457588, "learning_rate": 2.8997878573593296e-06, "loss": 0.3824, "step": 14060 }, { "epoch": 0.7105881164617055, "grad_norm": 7.669338040232453, "learning_rate": 2.8947368421052634e-06, "loss": 0.3595, "step": 14070 }, { "epoch": 0.7110931542132775, "grad_norm": 11.74527716840071, "learning_rate": 2.8896858268511975e-06, "loss": 0.3543, "step": 14080 }, { "epoch": 0.7115981919648494, "grad_norm": 6.646635907284065, "learning_rate": 2.884634811597131e-06, "loss": 0.372, "step": 14090 }, { "epoch": 0.7121032297164213, "grad_norm": 7.097527430156463, "learning_rate": 2.879583796343065e-06, "loss": 0.3551, "step": 14100 }, { "epoch": 0.7126082674679932, "grad_norm": 13.591824996252463, "learning_rate": 2.874532781088999e-06, "loss": 0.3681, "step": 14110 }, { "epoch": 0.7131133052195652, "grad_norm": 6.164884039122544, "learning_rate": 2.869481765834933e-06, "loss": 0.3587, "step": 14120 }, { "epoch": 0.7136183429711371, "grad_norm": 4.715084112453213, "learning_rate": 2.8644307505808672e-06, "loss": 0.3573, "step": 14130 }, { "epoch": 0.714123380722709, "grad_norm": 6.717756492187504, "learning_rate": 2.859379735326801e-06, "loss": 0.3686, "step": 14140 }, { "epoch": 0.7146284184742809, "grad_norm": 3.2667847864627855, "learning_rate": 2.854328720072735e-06, "loss": 0.3623, "step": 14150 }, { "epoch": 0.7151334562258529, "grad_norm": 4.553377349461034, "learning_rate": 2.8492777048186688e-06, "loss": 0.3696, "step": 14160 }, { "epoch": 0.7156384939774248, "grad_norm": 6.765157372727796, "learning_rate": 2.8442266895646025e-06, "loss": 0.3828, "step": 14170 }, { "epoch": 0.7161435317289967, "grad_norm": 11.922453637732414, "learning_rate": 2.8391756743105366e-06, "loss": 0.362, "step": 14180 }, { "epoch": 0.7166485694805687, "grad_norm": 14.304651599986355, "learning_rate": 2.8341246590564707e-06, "loss": 0.3609, "step": 14190 }, { "epoch": 0.7171536072321406, "grad_norm": 6.623357145482479, "learning_rate": 2.829073643802405e-06, "loss": 0.3697, "step": 14200 }, { "epoch": 0.7176586449837126, "grad_norm": 8.462478930205787, "learning_rate": 2.8240226285483385e-06, "loss": 0.3909, "step": 14210 }, { "epoch": 0.7181636827352844, "grad_norm": 4.464396808912767, "learning_rate": 2.8189716132942722e-06, "loss": 0.3653, "step": 14220 }, { "epoch": 0.7186687204868564, "grad_norm": 5.427275414848616, "learning_rate": 2.8139205980402064e-06, "loss": 0.3615, "step": 14230 }, { "epoch": 0.7191737582384283, "grad_norm": 6.5602036666267285, "learning_rate": 2.80886958278614e-06, "loss": 0.3793, "step": 14240 }, { "epoch": 0.7196787959900003, "grad_norm": 8.382640709296943, "learning_rate": 2.8038185675320738e-06, "loss": 0.3839, "step": 14250 }, { "epoch": 0.7201838337415721, "grad_norm": 8.38849977200125, "learning_rate": 2.7987675522780083e-06, "loss": 0.3632, "step": 14260 }, { "epoch": 0.7206888714931441, "grad_norm": 4.838800155468274, "learning_rate": 2.793716537023942e-06, "loss": 0.3654, "step": 14270 }, { "epoch": 0.721193909244716, "grad_norm": 3.202140029458944, "learning_rate": 2.788665521769876e-06, "loss": 0.3457, "step": 14280 }, { "epoch": 0.721698946996288, "grad_norm": 3.096801851946618, "learning_rate": 2.78361450651581e-06, "loss": 0.3624, "step": 14290 }, { "epoch": 0.7222039847478599, "grad_norm": 5.2594336987978885, "learning_rate": 2.7785634912617435e-06, "loss": 0.3595, "step": 14300 }, { "epoch": 0.7227090224994318, "grad_norm": 5.738417846740695, "learning_rate": 2.7735124760076777e-06, "loss": 0.3471, "step": 14310 }, { "epoch": 0.7232140602510038, "grad_norm": 3.3615923332479283, "learning_rate": 2.7684614607536114e-06, "loss": 0.3781, "step": 14320 }, { "epoch": 0.7237190980025757, "grad_norm": 13.026933882432127, "learning_rate": 2.763410445499546e-06, "loss": 0.3765, "step": 14330 }, { "epoch": 0.7242241357541476, "grad_norm": 2.4861597750527884, "learning_rate": 2.7583594302454796e-06, "loss": 0.3528, "step": 14340 }, { "epoch": 0.7247291735057195, "grad_norm": 6.6503607629591, "learning_rate": 2.7533084149914137e-06, "loss": 0.3618, "step": 14350 }, { "epoch": 0.7252342112572915, "grad_norm": 2.8124337865494464, "learning_rate": 2.7482573997373474e-06, "loss": 0.3791, "step": 14360 }, { "epoch": 0.7257392490088634, "grad_norm": 5.334310989331839, "learning_rate": 2.743206384483281e-06, "loss": 0.3702, "step": 14370 }, { "epoch": 0.7262442867604354, "grad_norm": 4.197944825898265, "learning_rate": 2.7381553692292153e-06, "loss": 0.3644, "step": 14380 }, { "epoch": 0.7267493245120072, "grad_norm": 3.4074066682001076, "learning_rate": 2.7331043539751494e-06, "loss": 0.3694, "step": 14390 }, { "epoch": 0.7272543622635792, "grad_norm": 2.5186168503160817, "learning_rate": 2.7280533387210835e-06, "loss": 0.3675, "step": 14400 }, { "epoch": 0.7277594000151512, "grad_norm": 2.7527199189922134, "learning_rate": 2.723002323467017e-06, "loss": 0.3474, "step": 14410 }, { "epoch": 0.7282644377667231, "grad_norm": 10.29943264391808, "learning_rate": 2.717951308212951e-06, "loss": 0.3627, "step": 14420 }, { "epoch": 0.728769475518295, "grad_norm": 3.012910231072848, "learning_rate": 2.712900292958885e-06, "loss": 0.359, "step": 14430 }, { "epoch": 0.7292745132698669, "grad_norm": 4.291318577581795, "learning_rate": 2.7078492777048187e-06, "loss": 0.3657, "step": 14440 }, { "epoch": 0.7297795510214389, "grad_norm": 3.161513732407597, "learning_rate": 2.7027982624507524e-06, "loss": 0.3774, "step": 14450 }, { "epoch": 0.7302845887730108, "grad_norm": 6.231424663495901, "learning_rate": 2.697747247196687e-06, "loss": 0.3496, "step": 14460 }, { "epoch": 0.7307896265245827, "grad_norm": 9.93800800520396, "learning_rate": 2.6926962319426207e-06, "loss": 0.3574, "step": 14470 }, { "epoch": 0.7312946642761546, "grad_norm": 5.792304139685358, "learning_rate": 2.687645216688555e-06, "loss": 0.3641, "step": 14480 }, { "epoch": 0.7317997020277266, "grad_norm": 6.779155670529348, "learning_rate": 2.6825942014344885e-06, "loss": 0.3545, "step": 14490 }, { "epoch": 0.7323047397792986, "grad_norm": 3.8135773883209527, "learning_rate": 2.6775431861804226e-06, "loss": 0.3722, "step": 14500 }, { "epoch": 0.7328097775308704, "grad_norm": 3.3544383619935805, "learning_rate": 2.6724921709263563e-06, "loss": 0.3603, "step": 14510 }, { "epoch": 0.7333148152824424, "grad_norm": 4.268409877609442, "learning_rate": 2.66744115567229e-06, "loss": 0.3682, "step": 14520 }, { "epoch": 0.7338198530340143, "grad_norm": 6.571521428968219, "learning_rate": 2.6623901404182246e-06, "loss": 0.376, "step": 14530 }, { "epoch": 0.7343248907855863, "grad_norm": 2.9026381605364935, "learning_rate": 2.6573391251641583e-06, "loss": 0.3705, "step": 14540 }, { "epoch": 0.7348299285371581, "grad_norm": 3.6365813772686635, "learning_rate": 2.6522881099100924e-06, "loss": 0.3556, "step": 14550 }, { "epoch": 0.7353349662887301, "grad_norm": 4.744776659753617, "learning_rate": 2.647237094656026e-06, "loss": 0.381, "step": 14560 }, { "epoch": 0.735840004040302, "grad_norm": 3.8265663489625172, "learning_rate": 2.64218607940196e-06, "loss": 0.3675, "step": 14570 }, { "epoch": 0.736345041791874, "grad_norm": 4.046535114955657, "learning_rate": 2.637135064147894e-06, "loss": 0.354, "step": 14580 }, { "epoch": 0.7368500795434458, "grad_norm": 2.011858995754094, "learning_rate": 2.6320840488938276e-06, "loss": 0.3674, "step": 14590 }, { "epoch": 0.7373551172950178, "grad_norm": 8.920805773112406, "learning_rate": 2.627033033639762e-06, "loss": 0.3479, "step": 14600 }, { "epoch": 0.7378601550465897, "grad_norm": 3.136158587893057, "learning_rate": 2.621982018385696e-06, "loss": 0.3626, "step": 14610 }, { "epoch": 0.7383651927981617, "grad_norm": 3.2282462284034175, "learning_rate": 2.6169310031316296e-06, "loss": 0.3612, "step": 14620 }, { "epoch": 0.7388702305497336, "grad_norm": 2.7063221534655018, "learning_rate": 2.6118799878775637e-06, "loss": 0.3658, "step": 14630 }, { "epoch": 0.7393752683013055, "grad_norm": 2.0705371802094663, "learning_rate": 2.6068289726234974e-06, "loss": 0.3619, "step": 14640 }, { "epoch": 0.7398803060528775, "grad_norm": 2.452888217750629, "learning_rate": 2.601777957369431e-06, "loss": 0.3745, "step": 14650 }, { "epoch": 0.7403853438044494, "grad_norm": 3.5210431082573987, "learning_rate": 2.596726942115365e-06, "loss": 0.3621, "step": 14660 }, { "epoch": 0.7408903815560213, "grad_norm": 2.705820608986338, "learning_rate": 2.5916759268612993e-06, "loss": 0.3722, "step": 14670 }, { "epoch": 0.7413954193075932, "grad_norm": 3.8301828984326005, "learning_rate": 2.5866249116072335e-06, "loss": 0.3475, "step": 14680 }, { "epoch": 0.7419004570591652, "grad_norm": 2.506550997138522, "learning_rate": 2.581573896353167e-06, "loss": 0.379, "step": 14690 }, { "epoch": 0.7424054948107371, "grad_norm": 4.732673442087478, "learning_rate": 2.5765228810991013e-06, "loss": 0.3646, "step": 14700 }, { "epoch": 0.742910532562309, "grad_norm": 3.5697728957238297, "learning_rate": 2.571471865845035e-06, "loss": 0.3749, "step": 14710 }, { "epoch": 0.743415570313881, "grad_norm": 3.408786643280019, "learning_rate": 2.5664208505909687e-06, "loss": 0.3729, "step": 14720 }, { "epoch": 0.7439206080654529, "grad_norm": 3.2680972265488584, "learning_rate": 2.5613698353369032e-06, "loss": 0.3709, "step": 14730 }, { "epoch": 0.7444256458170249, "grad_norm": 2.830354639883803, "learning_rate": 2.556318820082837e-06, "loss": 0.3575, "step": 14740 }, { "epoch": 0.7449306835685967, "grad_norm": 2.1934046044941216, "learning_rate": 2.551267804828771e-06, "loss": 0.3615, "step": 14750 }, { "epoch": 0.7454357213201687, "grad_norm": 3.7209556097533363, "learning_rate": 2.5462167895747048e-06, "loss": 0.3759, "step": 14760 }, { "epoch": 0.7459407590717406, "grad_norm": 2.171529108489969, "learning_rate": 2.5411657743206385e-06, "loss": 0.3736, "step": 14770 }, { "epoch": 0.7464457968233126, "grad_norm": 3.2881080411249903, "learning_rate": 2.5361147590665726e-06, "loss": 0.3856, "step": 14780 }, { "epoch": 0.7469508345748844, "grad_norm": 2.1841022217022004, "learning_rate": 2.5310637438125063e-06, "loss": 0.3594, "step": 14790 }, { "epoch": 0.7474558723264564, "grad_norm": 11.74984061570617, "learning_rate": 2.526012728558441e-06, "loss": 0.3691, "step": 14800 }, { "epoch": 0.7479609100780283, "grad_norm": 2.64035990962851, "learning_rate": 2.5209617133043745e-06, "loss": 0.3426, "step": 14810 }, { "epoch": 0.7484659478296003, "grad_norm": 1.7507289069126455, "learning_rate": 2.5159106980503082e-06, "loss": 0.3652, "step": 14820 }, { "epoch": 0.7489709855811721, "grad_norm": 2.720025461643553, "learning_rate": 2.5108596827962423e-06, "loss": 0.3757, "step": 14830 }, { "epoch": 0.7494760233327441, "grad_norm": 2.595489021549687, "learning_rate": 2.505808667542176e-06, "loss": 0.3659, "step": 14840 }, { "epoch": 0.7499810610843161, "grad_norm": 3.4867867255116884, "learning_rate": 2.50075765228811e-06, "loss": 0.3733, "step": 14850 }, { "epoch": 0.750486098835888, "grad_norm": 2.716151183546756, "learning_rate": 2.4957066370340443e-06, "loss": 0.3609, "step": 14860 }, { "epoch": 0.7509911365874599, "grad_norm": 2.0400103144299737, "learning_rate": 2.490655621779978e-06, "loss": 0.3639, "step": 14870 }, { "epoch": 0.7514961743390318, "grad_norm": 9.410235472131538, "learning_rate": 2.4856046065259117e-06, "loss": 0.3483, "step": 14880 }, { "epoch": 0.7520012120906038, "grad_norm": 2.4049800584504584, "learning_rate": 2.480553591271846e-06, "loss": 0.3569, "step": 14890 }, { "epoch": 0.7525062498421757, "grad_norm": 3.7384168124978565, "learning_rate": 2.47550257601778e-06, "loss": 0.3542, "step": 14900 }, { "epoch": 0.7530112875937476, "grad_norm": 2.589659210465427, "learning_rate": 2.4704515607637136e-06, "loss": 0.368, "step": 14910 }, { "epoch": 0.7535163253453195, "grad_norm": 2.2792717912641383, "learning_rate": 2.4654005455096478e-06, "loss": 0.3575, "step": 14920 }, { "epoch": 0.7540213630968915, "grad_norm": 5.546504545797571, "learning_rate": 2.4603495302555815e-06, "loss": 0.361, "step": 14930 }, { "epoch": 0.7545264008484635, "grad_norm": 4.5236138626508415, "learning_rate": 2.4552985150015156e-06, "loss": 0.3652, "step": 14940 }, { "epoch": 0.7550314386000353, "grad_norm": 5.458941587498548, "learning_rate": 2.4502474997474497e-06, "loss": 0.3565, "step": 14950 }, { "epoch": 0.7555364763516073, "grad_norm": 2.182667398028913, "learning_rate": 2.4451964844933834e-06, "loss": 0.3574, "step": 14960 }, { "epoch": 0.7560415141031792, "grad_norm": 2.3501955437712563, "learning_rate": 2.440145469239317e-06, "loss": 0.3409, "step": 14970 }, { "epoch": 0.7565465518547512, "grad_norm": 2.8564528610222473, "learning_rate": 2.4350944539852512e-06, "loss": 0.3629, "step": 14980 }, { "epoch": 0.7570515896063231, "grad_norm": 3.888970347389462, "learning_rate": 2.4300434387311854e-06, "loss": 0.361, "step": 14990 }, { "epoch": 0.757556627357895, "grad_norm": 7.312481133609781, "learning_rate": 2.424992423477119e-06, "loss": 0.3614, "step": 15000 }, { "epoch": 0.7580616651094669, "grad_norm": 2.1317079758826813, "learning_rate": 2.4199414082230528e-06, "loss": 0.3738, "step": 15010 }, { "epoch": 0.7585667028610389, "grad_norm": 3.6516684684220366, "learning_rate": 2.414890392968987e-06, "loss": 0.3648, "step": 15020 }, { "epoch": 0.7590717406126108, "grad_norm": 2.6072438507069555, "learning_rate": 2.409839377714921e-06, "loss": 0.3621, "step": 15030 }, { "epoch": 0.7595767783641827, "grad_norm": 6.351203220775499, "learning_rate": 2.4047883624608547e-06, "loss": 0.3558, "step": 15040 }, { "epoch": 0.7600818161157546, "grad_norm": 4.130236652741164, "learning_rate": 2.399737347206789e-06, "loss": 0.369, "step": 15050 }, { "epoch": 0.7605868538673266, "grad_norm": 4.6672030495120564, "learning_rate": 2.394686331952723e-06, "loss": 0.3622, "step": 15060 }, { "epoch": 0.7610918916188986, "grad_norm": 12.886235429525053, "learning_rate": 2.3896353166986567e-06, "loss": 0.3622, "step": 15070 }, { "epoch": 0.7615969293704704, "grad_norm": 15.156964415943117, "learning_rate": 2.3845843014445904e-06, "loss": 0.3467, "step": 15080 }, { "epoch": 0.7621019671220424, "grad_norm": 2.87845766689377, "learning_rate": 2.3795332861905245e-06, "loss": 0.3505, "step": 15090 }, { "epoch": 0.7626070048736143, "grad_norm": 15.116534480941452, "learning_rate": 2.3744822709364586e-06, "loss": 0.3483, "step": 15100 }, { "epoch": 0.7631120426251863, "grad_norm": 7.257991090958527, "learning_rate": 2.3694312556823923e-06, "loss": 0.3822, "step": 15110 }, { "epoch": 0.7636170803767581, "grad_norm": 3.4000262016509697, "learning_rate": 2.3643802404283264e-06, "loss": 0.3609, "step": 15120 }, { "epoch": 0.7641221181283301, "grad_norm": 5.8302722893046734, "learning_rate": 2.35932922517426e-06, "loss": 0.3532, "step": 15130 }, { "epoch": 0.764627155879902, "grad_norm": 2.980190177193747, "learning_rate": 2.3542782099201943e-06, "loss": 0.3532, "step": 15140 }, { "epoch": 0.765132193631474, "grad_norm": 2.886158227100219, "learning_rate": 2.349227194666128e-06, "loss": 0.3747, "step": 15150 }, { "epoch": 0.7656372313830458, "grad_norm": 3.5525222031207635, "learning_rate": 2.344176179412062e-06, "loss": 0.3627, "step": 15160 }, { "epoch": 0.7661422691346178, "grad_norm": 5.160435812895103, "learning_rate": 2.3391251641579958e-06, "loss": 0.3638, "step": 15170 }, { "epoch": 0.7666473068861898, "grad_norm": 11.69613721594715, "learning_rate": 2.33407414890393e-06, "loss": 0.354, "step": 15180 }, { "epoch": 0.7671523446377617, "grad_norm": 2.5755050921925604, "learning_rate": 2.329023133649864e-06, "loss": 0.3571, "step": 15190 }, { "epoch": 0.7676573823893336, "grad_norm": 6.875578222537554, "learning_rate": 2.3239721183957977e-06, "loss": 0.3494, "step": 15200 }, { "epoch": 0.7681624201409055, "grad_norm": 3.951283263659072, "learning_rate": 2.3189211031417314e-06, "loss": 0.3642, "step": 15210 }, { "epoch": 0.7686674578924775, "grad_norm": 6.490351712773479, "learning_rate": 2.3138700878876655e-06, "loss": 0.3625, "step": 15220 }, { "epoch": 0.7691724956440494, "grad_norm": 6.723804099342641, "learning_rate": 2.3088190726335997e-06, "loss": 0.3542, "step": 15230 }, { "epoch": 0.7696775333956213, "grad_norm": 3.266407086981815, "learning_rate": 2.3037680573795334e-06, "loss": 0.3593, "step": 15240 }, { "epoch": 0.7701825711471932, "grad_norm": 3.9284527391422093, "learning_rate": 2.2987170421254675e-06, "loss": 0.3614, "step": 15250 }, { "epoch": 0.7706876088987652, "grad_norm": 15.223624137338168, "learning_rate": 2.2936660268714016e-06, "loss": 0.3662, "step": 15260 }, { "epoch": 0.7711926466503372, "grad_norm": 4.028959949491209, "learning_rate": 2.2886150116173353e-06, "loss": 0.3499, "step": 15270 }, { "epoch": 0.771697684401909, "grad_norm": 6.41110230244689, "learning_rate": 2.283563996363269e-06, "loss": 0.3753, "step": 15280 }, { "epoch": 0.772202722153481, "grad_norm": 4.076986580941685, "learning_rate": 2.278512981109203e-06, "loss": 0.334, "step": 15290 }, { "epoch": 0.7727077599050529, "grad_norm": 3.74862169818398, "learning_rate": 2.2734619658551373e-06, "loss": 0.3491, "step": 15300 }, { "epoch": 0.7732127976566249, "grad_norm": 3.385686236758005, "learning_rate": 2.268410950601071e-06, "loss": 0.3505, "step": 15310 }, { "epoch": 0.7737178354081967, "grad_norm": 4.72702362279656, "learning_rate": 2.2633599353470047e-06, "loss": 0.362, "step": 15320 }, { "epoch": 0.7742228731597687, "grad_norm": 15.01423003858265, "learning_rate": 2.2583089200929388e-06, "loss": 0.3557, "step": 15330 }, { "epoch": 0.7747279109113406, "grad_norm": 7.0698788662071275, "learning_rate": 2.253257904838873e-06, "loss": 0.3414, "step": 15340 }, { "epoch": 0.7752329486629126, "grad_norm": 3.524087806752161, "learning_rate": 2.2482068895848066e-06, "loss": 0.372, "step": 15350 }, { "epoch": 0.7757379864144844, "grad_norm": 3.595099576572771, "learning_rate": 2.2431558743307407e-06, "loss": 0.3621, "step": 15360 }, { "epoch": 0.7762430241660564, "grad_norm": 3.3403877064629786, "learning_rate": 2.2381048590766744e-06, "loss": 0.3565, "step": 15370 }, { "epoch": 0.7767480619176284, "grad_norm": 5.0138361529572, "learning_rate": 2.2330538438226086e-06, "loss": 0.374, "step": 15380 }, { "epoch": 0.7772530996692003, "grad_norm": 6.076755730180162, "learning_rate": 2.2280028285685423e-06, "loss": 0.3548, "step": 15390 }, { "epoch": 0.7777581374207722, "grad_norm": 7.698809050653406, "learning_rate": 2.2229518133144764e-06, "loss": 0.3667, "step": 15400 }, { "epoch": 0.7782631751723441, "grad_norm": 3.4467924224973046, "learning_rate": 2.2179007980604105e-06, "loss": 0.3628, "step": 15410 }, { "epoch": 0.7787682129239161, "grad_norm": 4.893559485834325, "learning_rate": 2.212849782806344e-06, "loss": 0.3644, "step": 15420 }, { "epoch": 0.779273250675488, "grad_norm": 4.646808808665608, "learning_rate": 2.2077987675522783e-06, "loss": 0.3615, "step": 15430 }, { "epoch": 0.7797782884270599, "grad_norm": 5.1159354751441395, "learning_rate": 2.202747752298212e-06, "loss": 0.3719, "step": 15440 }, { "epoch": 0.7802833261786318, "grad_norm": 3.6881998697625784, "learning_rate": 2.197696737044146e-06, "loss": 0.3607, "step": 15450 }, { "epoch": 0.7807883639302038, "grad_norm": 2.337125255187875, "learning_rate": 2.1926457217900803e-06, "loss": 0.3757, "step": 15460 }, { "epoch": 0.7812934016817757, "grad_norm": 4.428433133367473, "learning_rate": 2.187594706536014e-06, "loss": 0.3588, "step": 15470 }, { "epoch": 0.7817984394333476, "grad_norm": 3.458785718908454, "learning_rate": 2.1825436912819477e-06, "loss": 0.3705, "step": 15480 }, { "epoch": 0.7823034771849195, "grad_norm": 3.5942204693831936, "learning_rate": 2.177492676027882e-06, "loss": 0.3683, "step": 15490 }, { "epoch": 0.7828085149364915, "grad_norm": 5.889169421367107, "learning_rate": 2.172441660773816e-06, "loss": 0.3573, "step": 15500 }, { "epoch": 0.7833135526880635, "grad_norm": 3.261189892920889, "learning_rate": 2.1673906455197496e-06, "loss": 0.3512, "step": 15510 }, { "epoch": 0.7838185904396353, "grad_norm": 8.352842752027808, "learning_rate": 2.1623396302656833e-06, "loss": 0.3629, "step": 15520 }, { "epoch": 0.7843236281912073, "grad_norm": 2.96386077067302, "learning_rate": 2.1572886150116174e-06, "loss": 0.3636, "step": 15530 }, { "epoch": 0.7848286659427792, "grad_norm": 5.283581255544922, "learning_rate": 2.1522375997575516e-06, "loss": 0.3676, "step": 15540 }, { "epoch": 0.7853337036943512, "grad_norm": 2.6179107461634796, "learning_rate": 2.1471865845034853e-06, "loss": 0.354, "step": 15550 }, { "epoch": 0.785838741445923, "grad_norm": 7.9036355130392755, "learning_rate": 2.142135569249419e-06, "loss": 0.3621, "step": 15560 }, { "epoch": 0.786343779197495, "grad_norm": 4.522869551719609, "learning_rate": 2.137084553995353e-06, "loss": 0.3718, "step": 15570 }, { "epoch": 0.7868488169490669, "grad_norm": 4.684014168319032, "learning_rate": 2.1320335387412872e-06, "loss": 0.3543, "step": 15580 }, { "epoch": 0.7873538547006389, "grad_norm": 3.8914673062545697, "learning_rate": 2.126982523487221e-06, "loss": 0.3535, "step": 15590 }, { "epoch": 0.7878588924522109, "grad_norm": 5.961270787701582, "learning_rate": 2.121931508233155e-06, "loss": 0.3716, "step": 15600 }, { "epoch": 0.7883639302037827, "grad_norm": 30.183292311995285, "learning_rate": 2.116880492979089e-06, "loss": 0.3613, "step": 15610 }, { "epoch": 0.7888689679553547, "grad_norm": 6.697417465261187, "learning_rate": 2.111829477725023e-06, "loss": 0.3507, "step": 15620 }, { "epoch": 0.7893740057069266, "grad_norm": 3.0372920526034783, "learning_rate": 2.106778462470957e-06, "loss": 0.3556, "step": 15630 }, { "epoch": 0.7898790434584986, "grad_norm": 2.8257416604165764, "learning_rate": 2.1017274472168907e-06, "loss": 0.3558, "step": 15640 }, { "epoch": 0.7903840812100704, "grad_norm": 4.371382594369755, "learning_rate": 2.096676431962825e-06, "loss": 0.3708, "step": 15650 }, { "epoch": 0.7908891189616424, "grad_norm": 2.2916826306015388, "learning_rate": 2.0916254167087585e-06, "loss": 0.3575, "step": 15660 }, { "epoch": 0.7913941567132143, "grad_norm": 4.127089263253249, "learning_rate": 2.0865744014546926e-06, "loss": 0.3727, "step": 15670 }, { "epoch": 0.7918991944647863, "grad_norm": 4.5838298238269095, "learning_rate": 2.0815233862006263e-06, "loss": 0.3631, "step": 15680 }, { "epoch": 0.7924042322163581, "grad_norm": 2.13347121181111, "learning_rate": 2.0764723709465605e-06, "loss": 0.3577, "step": 15690 }, { "epoch": 0.7929092699679301, "grad_norm": 2.865333714554876, "learning_rate": 2.0714213556924946e-06, "loss": 0.3646, "step": 15700 }, { "epoch": 0.793414307719502, "grad_norm": 3.3712098781575968, "learning_rate": 2.0663703404384283e-06, "loss": 0.3665, "step": 15710 }, { "epoch": 0.793919345471074, "grad_norm": 6.140656487678135, "learning_rate": 2.061319325184362e-06, "loss": 0.3562, "step": 15720 }, { "epoch": 0.7944243832226459, "grad_norm": 6.1891047783662785, "learning_rate": 2.056268309930296e-06, "loss": 0.3724, "step": 15730 }, { "epoch": 0.7949294209742178, "grad_norm": 9.134267646827375, "learning_rate": 2.0512172946762302e-06, "loss": 0.3733, "step": 15740 }, { "epoch": 0.7954344587257898, "grad_norm": 3.939505362182306, "learning_rate": 2.046166279422164e-06, "loss": 0.356, "step": 15750 }, { "epoch": 0.7959394964773617, "grad_norm": 5.797259273365989, "learning_rate": 2.041115264168098e-06, "loss": 0.3657, "step": 15760 }, { "epoch": 0.7964445342289336, "grad_norm": 4.772342944803025, "learning_rate": 2.036064248914032e-06, "loss": 0.3565, "step": 15770 }, { "epoch": 0.7969495719805055, "grad_norm": 5.611273163023503, "learning_rate": 2.031013233659966e-06, "loss": 0.342, "step": 15780 }, { "epoch": 0.7974546097320775, "grad_norm": 4.35699870135289, "learning_rate": 2.0259622184058996e-06, "loss": 0.3564, "step": 15790 }, { "epoch": 0.7979596474836494, "grad_norm": 4.981366634684693, "learning_rate": 2.0209112031518337e-06, "loss": 0.374, "step": 15800 }, { "epoch": 0.7984646852352213, "grad_norm": 5.47051690828411, "learning_rate": 2.015860187897768e-06, "loss": 0.3648, "step": 15810 }, { "epoch": 0.7989697229867933, "grad_norm": 2.7267511419608192, "learning_rate": 2.0108091726437015e-06, "loss": 0.3633, "step": 15820 }, { "epoch": 0.7994747607383652, "grad_norm": 5.550689226148647, "learning_rate": 2.0057581573896352e-06, "loss": 0.3729, "step": 15830 }, { "epoch": 0.7999797984899372, "grad_norm": 3.302912522583322, "learning_rate": 2.0007071421355694e-06, "loss": 0.3594, "step": 15840 }, { "epoch": 0.800484836241509, "grad_norm": 5.487680267201285, "learning_rate": 1.9956561268815035e-06, "loss": 0.3529, "step": 15850 }, { "epoch": 0.800989873993081, "grad_norm": 2.2707759816513016, "learning_rate": 1.990605111627437e-06, "loss": 0.3608, "step": 15860 }, { "epoch": 0.8014949117446529, "grad_norm": 7.6358774792742805, "learning_rate": 1.9855540963733713e-06, "loss": 0.3664, "step": 15870 }, { "epoch": 0.8019999494962249, "grad_norm": 4.568057200617405, "learning_rate": 1.980503081119305e-06, "loss": 0.3625, "step": 15880 }, { "epoch": 0.8025049872477967, "grad_norm": 15.31945090537045, "learning_rate": 1.975452065865239e-06, "loss": 0.3489, "step": 15890 }, { "epoch": 0.8030100249993687, "grad_norm": 8.04419756034429, "learning_rate": 1.970401050611173e-06, "loss": 0.3658, "step": 15900 }, { "epoch": 0.8035150627509406, "grad_norm": 2.473274324466601, "learning_rate": 1.965350035357107e-06, "loss": 0.375, "step": 15910 }, { "epoch": 0.8040201005025126, "grad_norm": 2.6342811148775076, "learning_rate": 1.9602990201030406e-06, "loss": 0.3636, "step": 15920 }, { "epoch": 0.8045251382540844, "grad_norm": 2.6430322700309214, "learning_rate": 1.9552480048489748e-06, "loss": 0.3533, "step": 15930 }, { "epoch": 0.8050301760056564, "grad_norm": 4.757917337464147, "learning_rate": 1.950196989594909e-06, "loss": 0.3727, "step": 15940 }, { "epoch": 0.8055352137572284, "grad_norm": 4.511805579600671, "learning_rate": 1.9451459743408426e-06, "loss": 0.337, "step": 15950 }, { "epoch": 0.8060402515088003, "grad_norm": 4.838220936104523, "learning_rate": 1.9400949590867767e-06, "loss": 0.3577, "step": 15960 }, { "epoch": 0.8065452892603722, "grad_norm": 16.008417651939627, "learning_rate": 1.935043943832711e-06, "loss": 0.3475, "step": 15970 }, { "epoch": 0.8070503270119441, "grad_norm": 6.303632411143941, "learning_rate": 1.9299929285786445e-06, "loss": 0.3659, "step": 15980 }, { "epoch": 0.8075553647635161, "grad_norm": 5.190471720901927, "learning_rate": 1.9249419133245782e-06, "loss": 0.3546, "step": 15990 }, { "epoch": 0.808060402515088, "grad_norm": 3.242336430162645, "learning_rate": 1.9198908980705124e-06, "loss": 0.3509, "step": 16000 }, { "epoch": 0.8085654402666599, "grad_norm": 109.07082323352427, "learning_rate": 1.9148398828164465e-06, "loss": 0.3576, "step": 16010 }, { "epoch": 0.8090704780182318, "grad_norm": 4.924092447335461, "learning_rate": 1.90978886756238e-06, "loss": 0.3513, "step": 16020 }, { "epoch": 0.8095755157698038, "grad_norm": 4.008651128754475, "learning_rate": 1.904737852308314e-06, "loss": 0.3632, "step": 16030 }, { "epoch": 0.8100805535213758, "grad_norm": 2.704598654274715, "learning_rate": 1.8996868370542482e-06, "loss": 0.3789, "step": 16040 }, { "epoch": 0.8105855912729476, "grad_norm": 3.9535618478053838, "learning_rate": 1.894635821800182e-06, "loss": 0.366, "step": 16050 }, { "epoch": 0.8110906290245196, "grad_norm": 2.647502432727123, "learning_rate": 1.8895848065461158e-06, "loss": 0.3591, "step": 16060 }, { "epoch": 0.8115956667760915, "grad_norm": 3.1410244190511296, "learning_rate": 1.8845337912920497e-06, "loss": 0.371, "step": 16070 }, { "epoch": 0.8121007045276635, "grad_norm": 2.681292853976926, "learning_rate": 1.8794827760379839e-06, "loss": 0.3625, "step": 16080 }, { "epoch": 0.8126057422792353, "grad_norm": 3.2981049374910203, "learning_rate": 1.8744317607839178e-06, "loss": 0.3524, "step": 16090 }, { "epoch": 0.8131107800308073, "grad_norm": 3.021432379433984, "learning_rate": 1.8693807455298515e-06, "loss": 0.3787, "step": 16100 }, { "epoch": 0.8136158177823792, "grad_norm": 4.129295733061501, "learning_rate": 1.8643297302757856e-06, "loss": 0.3565, "step": 16110 }, { "epoch": 0.8141208555339512, "grad_norm": 2.7563655099935587, "learning_rate": 1.8592787150217195e-06, "loss": 0.3594, "step": 16120 }, { "epoch": 0.814625893285523, "grad_norm": 17.34216993322677, "learning_rate": 1.8542276997676534e-06, "loss": 0.364, "step": 16130 }, { "epoch": 0.815130931037095, "grad_norm": 9.964020505556881, "learning_rate": 1.8491766845135876e-06, "loss": 0.365, "step": 16140 }, { "epoch": 0.815635968788667, "grad_norm": 3.075416765785676, "learning_rate": 1.8441256692595213e-06, "loss": 0.3635, "step": 16150 }, { "epoch": 0.8161410065402389, "grad_norm": 5.243244426066463, "learning_rate": 1.8390746540054552e-06, "loss": 0.3684, "step": 16160 }, { "epoch": 0.8166460442918108, "grad_norm": 12.518246409079454, "learning_rate": 1.834023638751389e-06, "loss": 0.3622, "step": 16170 }, { "epoch": 0.8171510820433827, "grad_norm": 7.312613998018914, "learning_rate": 1.8289726234973232e-06, "loss": 0.3657, "step": 16180 }, { "epoch": 0.8176561197949547, "grad_norm": 3.588424798034187, "learning_rate": 1.8239216082432571e-06, "loss": 0.3704, "step": 16190 }, { "epoch": 0.8181611575465266, "grad_norm": 5.675252481804342, "learning_rate": 1.8188705929891908e-06, "loss": 0.3769, "step": 16200 }, { "epoch": 0.8186661952980986, "grad_norm": 4.439591443663197, "learning_rate": 1.813819577735125e-06, "loss": 0.3613, "step": 16210 }, { "epoch": 0.8191712330496704, "grad_norm": 3.8799599946003904, "learning_rate": 1.8087685624810589e-06, "loss": 0.3507, "step": 16220 }, { "epoch": 0.8196762708012424, "grad_norm": 6.706337065885557, "learning_rate": 1.8037175472269928e-06, "loss": 0.3524, "step": 16230 }, { "epoch": 0.8201813085528143, "grad_norm": 9.215432805836578, "learning_rate": 1.7986665319729269e-06, "loss": 0.3503, "step": 16240 }, { "epoch": 0.8206863463043863, "grad_norm": 14.076599744963326, "learning_rate": 1.7936155167188608e-06, "loss": 0.3628, "step": 16250 }, { "epoch": 0.8211913840559582, "grad_norm": 3.743352271180992, "learning_rate": 1.7885645014647945e-06, "loss": 0.3661, "step": 16260 }, { "epoch": 0.8216964218075301, "grad_norm": 5.7001568185616085, "learning_rate": 1.7835134862107284e-06, "loss": 0.3549, "step": 16270 }, { "epoch": 0.8222014595591021, "grad_norm": 7.558296626118487, "learning_rate": 1.7784624709566625e-06, "loss": 0.3595, "step": 16280 }, { "epoch": 0.822706497310674, "grad_norm": 4.177362518260731, "learning_rate": 1.7734114557025964e-06, "loss": 0.3649, "step": 16290 }, { "epoch": 0.8232115350622459, "grad_norm": 3.640941034887769, "learning_rate": 1.7683604404485301e-06, "loss": 0.3545, "step": 16300 }, { "epoch": 0.8237165728138178, "grad_norm": 3.349967752869647, "learning_rate": 1.7633094251944643e-06, "loss": 0.3489, "step": 16310 }, { "epoch": 0.8242216105653898, "grad_norm": 8.902832535316834, "learning_rate": 1.7582584099403982e-06, "loss": 0.3613, "step": 16320 }, { "epoch": 0.8247266483169617, "grad_norm": 4.87569549754752, "learning_rate": 1.753207394686332e-06, "loss": 0.3466, "step": 16330 }, { "epoch": 0.8252316860685336, "grad_norm": 3.113899384550637, "learning_rate": 1.748156379432266e-06, "loss": 0.356, "step": 16340 }, { "epoch": 0.8257367238201055, "grad_norm": 3.908258669462424, "learning_rate": 1.7431053641782001e-06, "loss": 0.3542, "step": 16350 }, { "epoch": 0.8262417615716775, "grad_norm": 4.63530589205007, "learning_rate": 1.7380543489241338e-06, "loss": 0.3627, "step": 16360 }, { "epoch": 0.8267467993232495, "grad_norm": 8.784657750253414, "learning_rate": 1.7330033336700677e-06, "loss": 0.3578, "step": 16370 }, { "epoch": 0.8272518370748213, "grad_norm": 6.545444728348479, "learning_rate": 1.7279523184160019e-06, "loss": 0.3539, "step": 16380 }, { "epoch": 0.8277568748263933, "grad_norm": 12.650331415485839, "learning_rate": 1.7229013031619358e-06, "loss": 0.3477, "step": 16390 }, { "epoch": 0.8282619125779652, "grad_norm": 6.830225414304928, "learning_rate": 1.7178502879078695e-06, "loss": 0.3503, "step": 16400 }, { "epoch": 0.8287669503295372, "grad_norm": 4.071173355925771, "learning_rate": 1.7127992726538036e-06, "loss": 0.3556, "step": 16410 }, { "epoch": 0.829271988081109, "grad_norm": 4.66654000796166, "learning_rate": 1.7077482573997375e-06, "loss": 0.3639, "step": 16420 }, { "epoch": 0.829777025832681, "grad_norm": 3.6997372517957183, "learning_rate": 1.7026972421456714e-06, "loss": 0.3624, "step": 16430 }, { "epoch": 0.8302820635842529, "grad_norm": 2.8900578489503537, "learning_rate": 1.6976462268916053e-06, "loss": 0.352, "step": 16440 }, { "epoch": 0.8307871013358249, "grad_norm": 7.911446253848937, "learning_rate": 1.6925952116375395e-06, "loss": 0.3439, "step": 16450 }, { "epoch": 0.8312921390873967, "grad_norm": 4.387449984988524, "learning_rate": 1.6875441963834732e-06, "loss": 0.3556, "step": 16460 }, { "epoch": 0.8317971768389687, "grad_norm": 3.423134745091436, "learning_rate": 1.682493181129407e-06, "loss": 0.3534, "step": 16470 }, { "epoch": 0.8323022145905407, "grad_norm": 6.067996848489734, "learning_rate": 1.6774421658753412e-06, "loss": 0.3579, "step": 16480 }, { "epoch": 0.8328072523421126, "grad_norm": 4.7407053967222, "learning_rate": 1.6723911506212751e-06, "loss": 0.3479, "step": 16490 }, { "epoch": 0.8333122900936845, "grad_norm": 11.91783824543377, "learning_rate": 1.6673401353672088e-06, "loss": 0.3457, "step": 16500 }, { "epoch": 0.8338173278452564, "grad_norm": 4.2675850830619115, "learning_rate": 1.6622891201131427e-06, "loss": 0.3652, "step": 16510 }, { "epoch": 0.8343223655968284, "grad_norm": 6.818897096480064, "learning_rate": 1.6572381048590768e-06, "loss": 0.3758, "step": 16520 }, { "epoch": 0.8348274033484003, "grad_norm": 4.98946971011121, "learning_rate": 1.6521870896050108e-06, "loss": 0.3701, "step": 16530 }, { "epoch": 0.8353324410999722, "grad_norm": 9.186502501610118, "learning_rate": 1.6471360743509447e-06, "loss": 0.3621, "step": 16540 }, { "epoch": 0.8358374788515441, "grad_norm": 4.424346513819785, "learning_rate": 1.6420850590968788e-06, "loss": 0.355, "step": 16550 }, { "epoch": 0.8363425166031161, "grad_norm": 6.493343915545971, "learning_rate": 1.6370340438428125e-06, "loss": 0.3606, "step": 16560 }, { "epoch": 0.836847554354688, "grad_norm": 9.200578558866452, "learning_rate": 1.6319830285887464e-06, "loss": 0.3586, "step": 16570 }, { "epoch": 0.8373525921062599, "grad_norm": 3.604785576320196, "learning_rate": 1.6269320133346805e-06, "loss": 0.3659, "step": 16580 }, { "epoch": 0.8378576298578319, "grad_norm": 6.963237973002329, "learning_rate": 1.6218809980806144e-06, "loss": 0.3585, "step": 16590 }, { "epoch": 0.8383626676094038, "grad_norm": 2.3248924867977525, "learning_rate": 1.6168299828265481e-06, "loss": 0.361, "step": 16600 }, { "epoch": 0.8388677053609758, "grad_norm": 2.6056154769998776, "learning_rate": 1.611778967572482e-06, "loss": 0.3551, "step": 16610 }, { "epoch": 0.8393727431125476, "grad_norm": 5.347914332983081, "learning_rate": 1.6067279523184162e-06, "loss": 0.3626, "step": 16620 }, { "epoch": 0.8398777808641196, "grad_norm": 4.2282210080509275, "learning_rate": 1.60167693706435e-06, "loss": 0.3408, "step": 16630 }, { "epoch": 0.8403828186156915, "grad_norm": 64.68616959956208, "learning_rate": 1.596625921810284e-06, "loss": 0.361, "step": 16640 }, { "epoch": 0.8408878563672635, "grad_norm": 4.552604163372721, "learning_rate": 1.5915749065562181e-06, "loss": 0.3583, "step": 16650 }, { "epoch": 0.8413928941188353, "grad_norm": 5.37530591780756, "learning_rate": 1.5865238913021518e-06, "loss": 0.3677, "step": 16660 }, { "epoch": 0.8418979318704073, "grad_norm": 5.851389647033064, "learning_rate": 1.5814728760480857e-06, "loss": 0.3496, "step": 16670 }, { "epoch": 0.8424029696219792, "grad_norm": 12.669284701519452, "learning_rate": 1.5764218607940196e-06, "loss": 0.3537, "step": 16680 }, { "epoch": 0.8429080073735512, "grad_norm": 4.126283598831685, "learning_rate": 1.5713708455399538e-06, "loss": 0.3508, "step": 16690 }, { "epoch": 0.843413045125123, "grad_norm": 5.925620788856901, "learning_rate": 1.5663198302858877e-06, "loss": 0.3575, "step": 16700 }, { "epoch": 0.843918082876695, "grad_norm": 4.664307589807468, "learning_rate": 1.5612688150318214e-06, "loss": 0.3649, "step": 16710 }, { "epoch": 0.844423120628267, "grad_norm": 11.693975175758714, "learning_rate": 1.5562177997777555e-06, "loss": 0.3524, "step": 16720 }, { "epoch": 0.8449281583798389, "grad_norm": 4.379171141956925, "learning_rate": 1.5511667845236894e-06, "loss": 0.3636, "step": 16730 }, { "epoch": 0.8454331961314108, "grad_norm": 4.966223200480838, "learning_rate": 1.5461157692696233e-06, "loss": 0.3754, "step": 16740 }, { "epoch": 0.8459382338829827, "grad_norm": 6.214250837875937, "learning_rate": 1.5410647540155575e-06, "loss": 0.3591, "step": 16750 }, { "epoch": 0.8464432716345547, "grad_norm": 3.0308071228407876, "learning_rate": 1.5360137387614912e-06, "loss": 0.3542, "step": 16760 }, { "epoch": 0.8469483093861266, "grad_norm": 2.6208353352508613, "learning_rate": 1.530962723507425e-06, "loss": 0.3685, "step": 16770 }, { "epoch": 0.8474533471376986, "grad_norm": 6.849704130112083, "learning_rate": 1.525911708253359e-06, "loss": 0.3656, "step": 16780 }, { "epoch": 0.8479583848892704, "grad_norm": 4.032506964164272, "learning_rate": 1.520860692999293e-06, "loss": 0.3432, "step": 16790 }, { "epoch": 0.8484634226408424, "grad_norm": 3.5062245447178464, "learning_rate": 1.515809677745227e-06, "loss": 0.3635, "step": 16800 }, { "epoch": 0.8489684603924144, "grad_norm": 3.098328874271106, "learning_rate": 1.5107586624911607e-06, "loss": 0.3625, "step": 16810 }, { "epoch": 0.8494734981439863, "grad_norm": 4.466142042227375, "learning_rate": 1.5057076472370948e-06, "loss": 0.3613, "step": 16820 }, { "epoch": 0.8499785358955582, "grad_norm": 5.448485215579026, "learning_rate": 1.5006566319830287e-06, "loss": 0.3677, "step": 16830 }, { "epoch": 0.8504835736471301, "grad_norm": 3.7693651118255795, "learning_rate": 1.4956056167289627e-06, "loss": 0.3597, "step": 16840 }, { "epoch": 0.8509886113987021, "grad_norm": 3.0048683020938265, "learning_rate": 1.4905546014748964e-06, "loss": 0.3458, "step": 16850 }, { "epoch": 0.851493649150274, "grad_norm": 3.6083434998949278, "learning_rate": 1.4855035862208305e-06, "loss": 0.3524, "step": 16860 }, { "epoch": 0.8519986869018459, "grad_norm": 3.8301361676049375, "learning_rate": 1.4804525709667644e-06, "loss": 0.354, "step": 16870 }, { "epoch": 0.8525037246534178, "grad_norm": 9.70263976797423, "learning_rate": 1.4754015557126983e-06, "loss": 0.335, "step": 16880 }, { "epoch": 0.8530087624049898, "grad_norm": 2.9237142901845368, "learning_rate": 1.4703505404586324e-06, "loss": 0.3569, "step": 16890 }, { "epoch": 0.8535138001565618, "grad_norm": 9.647980801949249, "learning_rate": 1.4652995252045663e-06, "loss": 0.3514, "step": 16900 }, { "epoch": 0.8540188379081336, "grad_norm": 10.070888822677043, "learning_rate": 1.4602485099505e-06, "loss": 0.3576, "step": 16910 }, { "epoch": 0.8545238756597056, "grad_norm": 3.273605158275998, "learning_rate": 1.4551974946964342e-06, "loss": 0.3707, "step": 16920 }, { "epoch": 0.8550289134112775, "grad_norm": 4.9136034513233575, "learning_rate": 1.450146479442368e-06, "loss": 0.3606, "step": 16930 }, { "epoch": 0.8555339511628495, "grad_norm": 2.4213339644373426, "learning_rate": 1.445095464188302e-06, "loss": 0.3511, "step": 16940 }, { "epoch": 0.8560389889144213, "grad_norm": 5.947198902514986, "learning_rate": 1.4400444489342357e-06, "loss": 0.3441, "step": 16950 }, { "epoch": 0.8565440266659933, "grad_norm": 4.128142354654801, "learning_rate": 1.4349934336801698e-06, "loss": 0.3421, "step": 16960 }, { "epoch": 0.8570490644175652, "grad_norm": 3.699537501488419, "learning_rate": 1.4299424184261037e-06, "loss": 0.3517, "step": 16970 }, { "epoch": 0.8575541021691372, "grad_norm": 2.4855687780449074, "learning_rate": 1.4248914031720376e-06, "loss": 0.3571, "step": 16980 }, { "epoch": 0.858059139920709, "grad_norm": 4.059592927833855, "learning_rate": 1.4198403879179718e-06, "loss": 0.3515, "step": 16990 }, { "epoch": 0.858564177672281, "grad_norm": 10.362480051840812, "learning_rate": 1.4147893726639057e-06, "loss": 0.3599, "step": 17000 }, { "epoch": 0.859069215423853, "grad_norm": 4.66559692240262, "learning_rate": 1.4097383574098394e-06, "loss": 0.3659, "step": 17010 }, { "epoch": 0.8595742531754249, "grad_norm": 3.0262644076926244, "learning_rate": 1.4046873421557733e-06, "loss": 0.3627, "step": 17020 }, { "epoch": 0.8600792909269968, "grad_norm": 6.4805418875740335, "learning_rate": 1.3996363269017074e-06, "loss": 0.3552, "step": 17030 }, { "epoch": 0.8605843286785687, "grad_norm": 6.48974005858549, "learning_rate": 1.3945853116476413e-06, "loss": 0.3742, "step": 17040 }, { "epoch": 0.8610893664301407, "grad_norm": 12.933277403296861, "learning_rate": 1.389534296393575e-06, "loss": 0.3688, "step": 17050 }, { "epoch": 0.8615944041817126, "grad_norm": 5.0135946224215076, "learning_rate": 1.3844832811395094e-06, "loss": 0.3495, "step": 17060 }, { "epoch": 0.8620994419332845, "grad_norm": 4.664374173808275, "learning_rate": 1.379432265885443e-06, "loss": 0.3479, "step": 17070 }, { "epoch": 0.8626044796848564, "grad_norm": 3.413827294110899, "learning_rate": 1.374381250631377e-06, "loss": 0.363, "step": 17080 }, { "epoch": 0.8631095174364284, "grad_norm": 4.110151494898355, "learning_rate": 1.369330235377311e-06, "loss": 0.3601, "step": 17090 }, { "epoch": 0.8636145551880003, "grad_norm": 4.057869159667351, "learning_rate": 1.364279220123245e-06, "loss": 0.3524, "step": 17100 }, { "epoch": 0.8641195929395722, "grad_norm": 3.345052422207178, "learning_rate": 1.3592282048691787e-06, "loss": 0.3464, "step": 17110 }, { "epoch": 0.8646246306911441, "grad_norm": 17.60499598627659, "learning_rate": 1.3541771896151126e-06, "loss": 0.3457, "step": 17120 }, { "epoch": 0.8651296684427161, "grad_norm": 3.1318074334341395, "learning_rate": 1.3491261743610467e-06, "loss": 0.3519, "step": 17130 }, { "epoch": 0.8656347061942881, "grad_norm": 8.924077465063698, "learning_rate": 1.3440751591069806e-06, "loss": 0.3536, "step": 17140 }, { "epoch": 0.8661397439458599, "grad_norm": 5.697646524473763, "learning_rate": 1.3390241438529146e-06, "loss": 0.3528, "step": 17150 }, { "epoch": 0.8666447816974319, "grad_norm": 2.759497306215723, "learning_rate": 1.3339731285988487e-06, "loss": 0.3619, "step": 17160 }, { "epoch": 0.8671498194490038, "grad_norm": 7.1692765874676905, "learning_rate": 1.3289221133447824e-06, "loss": 0.3719, "step": 17170 }, { "epoch": 0.8676548572005758, "grad_norm": 7.494321248279156, "learning_rate": 1.3238710980907163e-06, "loss": 0.345, "step": 17180 }, { "epoch": 0.8681598949521476, "grad_norm": 3.809737865656533, "learning_rate": 1.3188200828366502e-06, "loss": 0.3482, "step": 17190 }, { "epoch": 0.8686649327037196, "grad_norm": 2.566570715695477, "learning_rate": 1.3137690675825843e-06, "loss": 0.3606, "step": 17200 }, { "epoch": 0.8691699704552915, "grad_norm": 5.642929958428042, "learning_rate": 1.308718052328518e-06, "loss": 0.3563, "step": 17210 }, { "epoch": 0.8696750082068635, "grad_norm": 7.32713592653821, "learning_rate": 1.303667037074452e-06, "loss": 0.3546, "step": 17220 }, { "epoch": 0.8701800459584353, "grad_norm": 3.958850560726077, "learning_rate": 1.298616021820386e-06, "loss": 0.3455, "step": 17230 }, { "epoch": 0.8706850837100073, "grad_norm": 6.811151879320974, "learning_rate": 1.29356500656632e-06, "loss": 0.3549, "step": 17240 }, { "epoch": 0.8711901214615793, "grad_norm": 7.69433841766783, "learning_rate": 1.2885139913122539e-06, "loss": 0.3521, "step": 17250 }, { "epoch": 0.8716951592131512, "grad_norm": 2.703944056955305, "learning_rate": 1.283462976058188e-06, "loss": 0.364, "step": 17260 }, { "epoch": 0.8722001969647231, "grad_norm": 2.6933087195097296, "learning_rate": 1.2784119608041217e-06, "loss": 0.3611, "step": 17270 }, { "epoch": 0.872705234716295, "grad_norm": 5.320181701389919, "learning_rate": 1.2733609455500556e-06, "loss": 0.3645, "step": 17280 }, { "epoch": 0.873210272467867, "grad_norm": 3.361877630451869, "learning_rate": 1.2683099302959895e-06, "loss": 0.3519, "step": 17290 }, { "epoch": 0.8737153102194389, "grad_norm": 2.951104064745123, "learning_rate": 1.2632589150419237e-06, "loss": 0.3587, "step": 17300 }, { "epoch": 0.8742203479710108, "grad_norm": 6.829151598791138, "learning_rate": 1.2582078997878574e-06, "loss": 0.3449, "step": 17310 }, { "epoch": 0.8747253857225827, "grad_norm": 2.350852229769939, "learning_rate": 1.2531568845337913e-06, "loss": 0.3478, "step": 17320 }, { "epoch": 0.8752304234741547, "grad_norm": 3.348683817462015, "learning_rate": 1.2481058692797254e-06, "loss": 0.3542, "step": 17330 }, { "epoch": 0.8757354612257267, "grad_norm": 13.330436598568463, "learning_rate": 1.2430548540256593e-06, "loss": 0.3606, "step": 17340 }, { "epoch": 0.8762404989772985, "grad_norm": 2.9324631629214233, "learning_rate": 1.2380038387715932e-06, "loss": 0.3403, "step": 17350 }, { "epoch": 0.8767455367288705, "grad_norm": 12.462910905007668, "learning_rate": 1.2329528235175271e-06, "loss": 0.36, "step": 17360 }, { "epoch": 0.8772505744804424, "grad_norm": 6.774737202335634, "learning_rate": 1.227901808263461e-06, "loss": 0.3529, "step": 17370 }, { "epoch": 0.8777556122320144, "grad_norm": 6.599803354341647, "learning_rate": 1.222850793009395e-06, "loss": 0.349, "step": 17380 }, { "epoch": 0.8782606499835863, "grad_norm": 4.087712138749244, "learning_rate": 1.2177997777553289e-06, "loss": 0.3585, "step": 17390 }, { "epoch": 0.8787656877351582, "grad_norm": 5.90307508489925, "learning_rate": 1.2127487625012628e-06, "loss": 0.3779, "step": 17400 }, { "epoch": 0.8792707254867301, "grad_norm": 2.9671035952158404, "learning_rate": 1.2076977472471967e-06, "loss": 0.3581, "step": 17410 }, { "epoch": 0.8797757632383021, "grad_norm": 3.797739053184357, "learning_rate": 1.2026467319931308e-06, "loss": 0.3523, "step": 17420 }, { "epoch": 0.880280800989874, "grad_norm": 2.753560543414613, "learning_rate": 1.1975957167390647e-06, "loss": 0.3525, "step": 17430 }, { "epoch": 0.8807858387414459, "grad_norm": 9.720571030998466, "learning_rate": 1.1925447014849986e-06, "loss": 0.3541, "step": 17440 }, { "epoch": 0.8812908764930178, "grad_norm": 3.0251393418296226, "learning_rate": 1.1874936862309326e-06, "loss": 0.346, "step": 17450 }, { "epoch": 0.8817959142445898, "grad_norm": 3.841487482571985, "learning_rate": 1.1824426709768665e-06, "loss": 0.3667, "step": 17460 }, { "epoch": 0.8823009519961618, "grad_norm": 3.3891055128420655, "learning_rate": 1.1773916557228004e-06, "loss": 0.34, "step": 17470 }, { "epoch": 0.8828059897477336, "grad_norm": 6.131210762114432, "learning_rate": 1.1723406404687343e-06, "loss": 0.3631, "step": 17480 }, { "epoch": 0.8833110274993056, "grad_norm": 3.2369764099154117, "learning_rate": 1.1672896252146682e-06, "loss": 0.3588, "step": 17490 }, { "epoch": 0.8838160652508775, "grad_norm": 5.076564547768568, "learning_rate": 1.1622386099606021e-06, "loss": 0.3556, "step": 17500 }, { "epoch": 0.8843211030024495, "grad_norm": 4.679877306058496, "learning_rate": 1.1571875947065362e-06, "loss": 0.344, "step": 17510 }, { "epoch": 0.8848261407540213, "grad_norm": 11.330975476714967, "learning_rate": 1.15213657945247e-06, "loss": 0.3602, "step": 17520 }, { "epoch": 0.8853311785055933, "grad_norm": 3.0144585760929576, "learning_rate": 1.147085564198404e-06, "loss": 0.3489, "step": 17530 }, { "epoch": 0.8858362162571652, "grad_norm": 6.287423740993079, "learning_rate": 1.142034548944338e-06, "loss": 0.3558, "step": 17540 }, { "epoch": 0.8863412540087372, "grad_norm": 3.489576000710248, "learning_rate": 1.1369835336902719e-06, "loss": 0.3531, "step": 17550 }, { "epoch": 0.886846291760309, "grad_norm": 3.733608437865078, "learning_rate": 1.1319325184362058e-06, "loss": 0.375, "step": 17560 }, { "epoch": 0.887351329511881, "grad_norm": 5.217817871919478, "learning_rate": 1.1268815031821397e-06, "loss": 0.3542, "step": 17570 }, { "epoch": 0.887856367263453, "grad_norm": 2.799146326616282, "learning_rate": 1.1218304879280736e-06, "loss": 0.3483, "step": 17580 }, { "epoch": 0.8883614050150249, "grad_norm": 2.6219946573680275, "learning_rate": 1.1167794726740075e-06, "loss": 0.3553, "step": 17590 }, { "epoch": 0.8888664427665968, "grad_norm": 3.3194236691159764, "learning_rate": 1.1117284574199414e-06, "loss": 0.3714, "step": 17600 }, { "epoch": 0.8893714805181687, "grad_norm": 7.465096104263228, "learning_rate": 1.1066774421658756e-06, "loss": 0.358, "step": 17610 }, { "epoch": 0.8898765182697407, "grad_norm": 7.584088341048675, "learning_rate": 1.1016264269118093e-06, "loss": 0.3474, "step": 17620 }, { "epoch": 0.8903815560213126, "grad_norm": 4.658333080329891, "learning_rate": 1.0965754116577434e-06, "loss": 0.3491, "step": 17630 }, { "epoch": 0.8908865937728845, "grad_norm": 3.8001771174540377, "learning_rate": 1.091524396403677e-06, "loss": 0.354, "step": 17640 }, { "epoch": 0.8913916315244564, "grad_norm": 2.5239942090163834, "learning_rate": 1.0864733811496112e-06, "loss": 0.3526, "step": 17650 }, { "epoch": 0.8918966692760284, "grad_norm": 3.192080136278505, "learning_rate": 1.0814223658955451e-06, "loss": 0.3674, "step": 17660 }, { "epoch": 0.8924017070276004, "grad_norm": 5.19946883439612, "learning_rate": 1.076371350641479e-06, "loss": 0.3575, "step": 17670 }, { "epoch": 0.8929067447791722, "grad_norm": 4.214472251420109, "learning_rate": 1.071320335387413e-06, "loss": 0.3389, "step": 17680 }, { "epoch": 0.8934117825307442, "grad_norm": 10.692110937064982, "learning_rate": 1.0662693201333469e-06, "loss": 0.3731, "step": 17690 }, { "epoch": 0.8939168202823161, "grad_norm": 6.139646910075302, "learning_rate": 1.0612183048792808e-06, "loss": 0.3472, "step": 17700 }, { "epoch": 0.8944218580338881, "grad_norm": 3.5048760100552157, "learning_rate": 1.056167289625215e-06, "loss": 0.3479, "step": 17710 }, { "epoch": 0.8949268957854599, "grad_norm": 3.4931757105634254, "learning_rate": 1.0511162743711486e-06, "loss": 0.3546, "step": 17720 }, { "epoch": 0.8954319335370319, "grad_norm": 7.742107901473751, "learning_rate": 1.0460652591170827e-06, "loss": 0.3501, "step": 17730 }, { "epoch": 0.8959369712886038, "grad_norm": 2.8474052373770107, "learning_rate": 1.0410142438630164e-06, "loss": 0.3509, "step": 17740 }, { "epoch": 0.8964420090401758, "grad_norm": 3.2173387588789186, "learning_rate": 1.0359632286089505e-06, "loss": 0.3513, "step": 17750 }, { "epoch": 0.8969470467917476, "grad_norm": 6.294405766765221, "learning_rate": 1.0309122133548845e-06, "loss": 0.3566, "step": 17760 }, { "epoch": 0.8974520845433196, "grad_norm": 2.4812196184280344, "learning_rate": 1.0258611981008184e-06, "loss": 0.3517, "step": 17770 }, { "epoch": 0.8979571222948916, "grad_norm": 34.26237777325275, "learning_rate": 1.0208101828467523e-06, "loss": 0.3448, "step": 17780 }, { "epoch": 0.8984621600464635, "grad_norm": 62.559391948683924, "learning_rate": 1.0157591675926862e-06, "loss": 0.361, "step": 17790 }, { "epoch": 0.8989671977980354, "grad_norm": 3.0126489507596106, "learning_rate": 1.01070815233862e-06, "loss": 0.3594, "step": 17800 }, { "epoch": 0.8994722355496073, "grad_norm": 4.4144846462481935, "learning_rate": 1.0056571370845542e-06, "loss": 0.3569, "step": 17810 }, { "epoch": 0.8999772733011793, "grad_norm": 6.230988179874581, "learning_rate": 1.000606121830488e-06, "loss": 0.3383, "step": 17820 }, { "epoch": 0.9004823110527512, "grad_norm": 2.294744478892904, "learning_rate": 9.95555106576422e-07, "loss": 0.3443, "step": 17830 }, { "epoch": 0.9009873488043231, "grad_norm": 5.478102588014345, "learning_rate": 9.905040913223558e-07, "loss": 0.3469, "step": 17840 }, { "epoch": 0.901492386555895, "grad_norm": 2.756012421267679, "learning_rate": 9.854530760682899e-07, "loss": 0.3629, "step": 17850 }, { "epoch": 0.901997424307467, "grad_norm": 2.9051333748854504, "learning_rate": 9.804020608142236e-07, "loss": 0.3493, "step": 17860 }, { "epoch": 0.902502462059039, "grad_norm": 2.942294785390223, "learning_rate": 9.753510455601577e-07, "loss": 0.3497, "step": 17870 }, { "epoch": 0.9030074998106108, "grad_norm": 3.2373052706846135, "learning_rate": 9.703000303060916e-07, "loss": 0.3423, "step": 17880 }, { "epoch": 0.9035125375621827, "grad_norm": 3.2500158251075266, "learning_rate": 9.652490150520255e-07, "loss": 0.3539, "step": 17890 }, { "epoch": 0.9040175753137547, "grad_norm": 3.5580015551516104, "learning_rate": 9.601979997979594e-07, "loss": 0.3332, "step": 17900 }, { "epoch": 0.9045226130653267, "grad_norm": 2.9893729880240203, "learning_rate": 9.551469845438933e-07, "loss": 0.3421, "step": 17910 }, { "epoch": 0.9050276508168985, "grad_norm": 3.1028575176827182, "learning_rate": 9.500959692898274e-07, "loss": 0.3408, "step": 17920 }, { "epoch": 0.9055326885684705, "grad_norm": 3.018466429936209, "learning_rate": 9.450449540357613e-07, "loss": 0.3444, "step": 17930 }, { "epoch": 0.9060377263200424, "grad_norm": 5.330235007532234, "learning_rate": 9.399939387816952e-07, "loss": 0.3635, "step": 17940 }, { "epoch": 0.9065427640716144, "grad_norm": 8.530098317004866, "learning_rate": 9.349429235276292e-07, "loss": 0.3499, "step": 17950 }, { "epoch": 0.9070478018231862, "grad_norm": 5.312690518311376, "learning_rate": 9.29891908273563e-07, "loss": 0.3542, "step": 17960 }, { "epoch": 0.9075528395747582, "grad_norm": 4.197543188110089, "learning_rate": 9.24840893019497e-07, "loss": 0.3545, "step": 17970 }, { "epoch": 0.9080578773263301, "grad_norm": 8.064198853556876, "learning_rate": 9.197898777654309e-07, "loss": 0.3642, "step": 17980 }, { "epoch": 0.9085629150779021, "grad_norm": 2.6318426760910656, "learning_rate": 9.147388625113649e-07, "loss": 0.3671, "step": 17990 }, { "epoch": 0.9090679528294741, "grad_norm": 5.88110776094459, "learning_rate": 9.096878472572989e-07, "loss": 0.3278, "step": 18000 }, { "epoch": 0.9095729905810459, "grad_norm": 7.327649689814793, "learning_rate": 9.046368320032327e-07, "loss": 0.3612, "step": 18010 }, { "epoch": 0.9100780283326179, "grad_norm": 5.1865153794476235, "learning_rate": 8.995858167491667e-07, "loss": 0.3382, "step": 18020 }, { "epoch": 0.9105830660841898, "grad_norm": 3.0447013025219696, "learning_rate": 8.945348014951005e-07, "loss": 0.3543, "step": 18030 }, { "epoch": 0.9110881038357618, "grad_norm": 3.6031825002335753, "learning_rate": 8.894837862410345e-07, "loss": 0.3447, "step": 18040 }, { "epoch": 0.9115931415873336, "grad_norm": 3.724192694212046, "learning_rate": 8.844327709869685e-07, "loss": 0.3435, "step": 18050 }, { "epoch": 0.9120981793389056, "grad_norm": 3.9945852787473015, "learning_rate": 8.793817557329023e-07, "loss": 0.3618, "step": 18060 }, { "epoch": 0.9126032170904775, "grad_norm": 4.943722371461341, "learning_rate": 8.743307404788364e-07, "loss": 0.3546, "step": 18070 }, { "epoch": 0.9131082548420495, "grad_norm": 3.3003892909027313, "learning_rate": 8.692797252247702e-07, "loss": 0.3341, "step": 18080 }, { "epoch": 0.9136132925936213, "grad_norm": 2.5050619983833826, "learning_rate": 8.642287099707042e-07, "loss": 0.3434, "step": 18090 }, { "epoch": 0.9141183303451933, "grad_norm": 3.7372168096985408, "learning_rate": 8.591776947166382e-07, "loss": 0.3456, "step": 18100 }, { "epoch": 0.9146233680967653, "grad_norm": 2.587657901087928, "learning_rate": 8.54126679462572e-07, "loss": 0.3442, "step": 18110 }, { "epoch": 0.9151284058483372, "grad_norm": 2.9656578110467207, "learning_rate": 8.49075664208506e-07, "loss": 0.3588, "step": 18120 }, { "epoch": 0.9156334435999091, "grad_norm": 3.3573402419593186, "learning_rate": 8.440246489544398e-07, "loss": 0.3277, "step": 18130 }, { "epoch": 0.916138481351481, "grad_norm": 2.7211678821317205, "learning_rate": 8.389736337003738e-07, "loss": 0.3531, "step": 18140 }, { "epoch": 0.916643519103053, "grad_norm": 5.733816461438319, "learning_rate": 8.339226184463079e-07, "loss": 0.3554, "step": 18150 }, { "epoch": 0.9171485568546249, "grad_norm": 2.9296826900284536, "learning_rate": 8.288716031922417e-07, "loss": 0.3484, "step": 18160 }, { "epoch": 0.9176535946061968, "grad_norm": 2.660140218702211, "learning_rate": 8.238205879381757e-07, "loss": 0.3456, "step": 18170 }, { "epoch": 0.9181586323577687, "grad_norm": 2.5446985764311996, "learning_rate": 8.187695726841095e-07, "loss": 0.3609, "step": 18180 }, { "epoch": 0.9186636701093407, "grad_norm": 4.659560104675861, "learning_rate": 8.137185574300435e-07, "loss": 0.3438, "step": 18190 }, { "epoch": 0.9191687078609126, "grad_norm": 2.727989773393059, "learning_rate": 8.086675421759773e-07, "loss": 0.3573, "step": 18200 }, { "epoch": 0.9196737456124845, "grad_norm": 2.415446839777949, "learning_rate": 8.036165269219113e-07, "loss": 0.353, "step": 18210 }, { "epoch": 0.9201787833640565, "grad_norm": 4.842714491783439, "learning_rate": 7.985655116678454e-07, "loss": 0.3616, "step": 18220 }, { "epoch": 0.9206838211156284, "grad_norm": 4.320254893741863, "learning_rate": 7.935144964137792e-07, "loss": 0.3441, "step": 18230 }, { "epoch": 0.9211888588672004, "grad_norm": 7.676940904287274, "learning_rate": 7.884634811597132e-07, "loss": 0.3469, "step": 18240 }, { "epoch": 0.9216938966187722, "grad_norm": 4.236663818861765, "learning_rate": 7.834124659056471e-07, "loss": 0.3415, "step": 18250 }, { "epoch": 0.9221989343703442, "grad_norm": 2.5909282600390977, "learning_rate": 7.78361450651581e-07, "loss": 0.3735, "step": 18260 }, { "epoch": 0.9227039721219161, "grad_norm": 3.095824958631971, "learning_rate": 7.73310435397515e-07, "loss": 0.3448, "step": 18270 }, { "epoch": 0.9232090098734881, "grad_norm": 2.915058703898021, "learning_rate": 7.682594201434488e-07, "loss": 0.3605, "step": 18280 }, { "epoch": 0.9237140476250599, "grad_norm": 4.0628438496453985, "learning_rate": 7.632084048893828e-07, "loss": 0.3522, "step": 18290 }, { "epoch": 0.9242190853766319, "grad_norm": 5.339121668100427, "learning_rate": 7.581573896353168e-07, "loss": 0.348, "step": 18300 }, { "epoch": 0.9247241231282038, "grad_norm": 2.9403014748581366, "learning_rate": 7.531063743812507e-07, "loss": 0.3436, "step": 18310 }, { "epoch": 0.9252291608797758, "grad_norm": 4.4297759422952065, "learning_rate": 7.480553591271847e-07, "loss": 0.3688, "step": 18320 }, { "epoch": 0.9257341986313476, "grad_norm": 8.718344498904465, "learning_rate": 7.430043438731185e-07, "loss": 0.3493, "step": 18330 }, { "epoch": 0.9262392363829196, "grad_norm": 17.16627380229125, "learning_rate": 7.379533286190525e-07, "loss": 0.3635, "step": 18340 }, { "epoch": 0.9267442741344916, "grad_norm": 6.9347616228201785, "learning_rate": 7.329023133649864e-07, "loss": 0.3443, "step": 18350 }, { "epoch": 0.9272493118860635, "grad_norm": 4.584801866934316, "learning_rate": 7.278512981109203e-07, "loss": 0.3381, "step": 18360 }, { "epoch": 0.9277543496376354, "grad_norm": 6.526194404732607, "learning_rate": 7.228002828568542e-07, "loss": 0.3795, "step": 18370 }, { "epoch": 0.9282593873892073, "grad_norm": 2.8075224191637536, "learning_rate": 7.177492676027882e-07, "loss": 0.3473, "step": 18380 }, { "epoch": 0.9287644251407793, "grad_norm": 8.151912813354414, "learning_rate": 7.126982523487222e-07, "loss": 0.3358, "step": 18390 }, { "epoch": 0.9292694628923512, "grad_norm": 4.7898553816163245, "learning_rate": 7.076472370946561e-07, "loss": 0.3662, "step": 18400 }, { "epoch": 0.9297745006439231, "grad_norm": 4.443345676540152, "learning_rate": 7.0259622184059e-07, "loss": 0.3576, "step": 18410 }, { "epoch": 0.930279538395495, "grad_norm": 2.1359533523104464, "learning_rate": 6.975452065865239e-07, "loss": 0.3469, "step": 18420 }, { "epoch": 0.930784576147067, "grad_norm": 37.17325982324839, "learning_rate": 6.924941913324579e-07, "loss": 0.3581, "step": 18430 }, { "epoch": 0.931289613898639, "grad_norm": 9.673960641714096, "learning_rate": 6.874431760783918e-07, "loss": 0.3522, "step": 18440 }, { "epoch": 0.9317946516502108, "grad_norm": 3.176528472152659, "learning_rate": 6.823921608243258e-07, "loss": 0.3464, "step": 18450 }, { "epoch": 0.9322996894017828, "grad_norm": 6.300715383126615, "learning_rate": 6.773411455702597e-07, "loss": 0.3497, "step": 18460 }, { "epoch": 0.9328047271533547, "grad_norm": 8.137473676823792, "learning_rate": 6.722901303161936e-07, "loss": 0.3542, "step": 18470 }, { "epoch": 0.9333097649049267, "grad_norm": 3.7878098971135756, "learning_rate": 6.672391150621276e-07, "loss": 0.331, "step": 18480 }, { "epoch": 0.9338148026564985, "grad_norm": 3.4537119018992883, "learning_rate": 6.621880998080615e-07, "loss": 0.349, "step": 18490 }, { "epoch": 0.9343198404080705, "grad_norm": 5.7332136754175425, "learning_rate": 6.571370845539954e-07, "loss": 0.3686, "step": 18500 }, { "epoch": 0.9348248781596424, "grad_norm": 2.40088922999737, "learning_rate": 6.520860692999293e-07, "loss": 0.3283, "step": 18510 }, { "epoch": 0.9353299159112144, "grad_norm": 3.363962036008761, "learning_rate": 6.470350540458632e-07, "loss": 0.3438, "step": 18520 }, { "epoch": 0.9358349536627862, "grad_norm": 4.168660835516454, "learning_rate": 6.419840387917973e-07, "loss": 0.3454, "step": 18530 }, { "epoch": 0.9363399914143582, "grad_norm": 31.57273809211181, "learning_rate": 6.369330235377312e-07, "loss": 0.3525, "step": 18540 }, { "epoch": 0.9368450291659302, "grad_norm": 2.6132048866458453, "learning_rate": 6.318820082836651e-07, "loss": 0.3445, "step": 18550 }, { "epoch": 0.9373500669175021, "grad_norm": 3.121982815268045, "learning_rate": 6.26830993029599e-07, "loss": 0.3548, "step": 18560 }, { "epoch": 0.937855104669074, "grad_norm": 3.878175783357159, "learning_rate": 6.21779977775533e-07, "loss": 0.3551, "step": 18570 }, { "epoch": 0.9383601424206459, "grad_norm": 4.9399268768023035, "learning_rate": 6.167289625214669e-07, "loss": 0.3406, "step": 18580 }, { "epoch": 0.9388651801722179, "grad_norm": 4.102118329658351, "learning_rate": 6.116779472674008e-07, "loss": 0.3415, "step": 18590 }, { "epoch": 0.9393702179237898, "grad_norm": 7.576480207566005, "learning_rate": 6.066269320133347e-07, "loss": 0.3474, "step": 18600 }, { "epoch": 0.9398752556753618, "grad_norm": 5.205648741417787, "learning_rate": 6.015759167592687e-07, "loss": 0.3701, "step": 18610 }, { "epoch": 0.9403802934269336, "grad_norm": 2.574703645382877, "learning_rate": 5.965249015052026e-07, "loss": 0.3711, "step": 18620 }, { "epoch": 0.9408853311785056, "grad_norm": 2.1710201302013608, "learning_rate": 5.914738862511366e-07, "loss": 0.3474, "step": 18630 }, { "epoch": 0.9413903689300775, "grad_norm": 4.238488989950128, "learning_rate": 5.864228709970705e-07, "loss": 0.3458, "step": 18640 }, { "epoch": 0.9418954066816495, "grad_norm": 3.9298261035180815, "learning_rate": 5.813718557430044e-07, "loss": 0.3517, "step": 18650 }, { "epoch": 0.9424004444332214, "grad_norm": 4.183286579494331, "learning_rate": 5.763208404889383e-07, "loss": 0.3535, "step": 18660 }, { "epoch": 0.9429054821847933, "grad_norm": 3.787432945522304, "learning_rate": 5.712698252348722e-07, "loss": 0.3438, "step": 18670 }, { "epoch": 0.9434105199363653, "grad_norm": 42.44616395667764, "learning_rate": 5.662188099808063e-07, "loss": 0.3487, "step": 18680 }, { "epoch": 0.9439155576879372, "grad_norm": 2.467350994396614, "learning_rate": 5.611677947267402e-07, "loss": 0.3495, "step": 18690 }, { "epoch": 0.9444205954395091, "grad_norm": 3.4763055801021197, "learning_rate": 5.561167794726741e-07, "loss": 0.3471, "step": 18700 }, { "epoch": 0.944925633191081, "grad_norm": 2.5579533898925604, "learning_rate": 5.51065764218608e-07, "loss": 0.3534, "step": 18710 }, { "epoch": 0.945430670942653, "grad_norm": 4.548010779885983, "learning_rate": 5.460147489645419e-07, "loss": 0.3471, "step": 18720 }, { "epoch": 0.9459357086942249, "grad_norm": 6.469724337112102, "learning_rate": 5.409637337104758e-07, "loss": 0.3519, "step": 18730 }, { "epoch": 0.9464407464457968, "grad_norm": 3.2909861731073007, "learning_rate": 5.359127184564098e-07, "loss": 0.3455, "step": 18740 }, { "epoch": 0.9469457841973687, "grad_norm": 3.9954669422093674, "learning_rate": 5.308617032023437e-07, "loss": 0.3484, "step": 18750 }, { "epoch": 0.9474508219489407, "grad_norm": 4.918272865216922, "learning_rate": 5.258106879482777e-07, "loss": 0.3543, "step": 18760 }, { "epoch": 0.9479558597005127, "grad_norm": 7.001698550237223, "learning_rate": 5.207596726942116e-07, "loss": 0.35, "step": 18770 }, { "epoch": 0.9484608974520845, "grad_norm": 5.052929072371654, "learning_rate": 5.157086574401455e-07, "loss": 0.3442, "step": 18780 }, { "epoch": 0.9489659352036565, "grad_norm": 4.671072953558266, "learning_rate": 5.106576421860794e-07, "loss": 0.3413, "step": 18790 }, { "epoch": 0.9494709729552284, "grad_norm": 2.2274459931275943, "learning_rate": 5.056066269320134e-07, "loss": 0.3579, "step": 18800 }, { "epoch": 0.9499760107068004, "grad_norm": 3.679235828488556, "learning_rate": 5.005556116779473e-07, "loss": 0.3482, "step": 18810 }, { "epoch": 0.9504810484583722, "grad_norm": 5.113998461225806, "learning_rate": 4.955045964238812e-07, "loss": 0.3492, "step": 18820 }, { "epoch": 0.9509860862099442, "grad_norm": 2.313685561005164, "learning_rate": 4.904535811698151e-07, "loss": 0.3498, "step": 18830 }, { "epoch": 0.9514911239615161, "grad_norm": 4.548823482232416, "learning_rate": 4.854025659157491e-07, "loss": 0.3381, "step": 18840 }, { "epoch": 0.9519961617130881, "grad_norm": 5.44596462224921, "learning_rate": 4.803515506616831e-07, "loss": 0.3499, "step": 18850 }, { "epoch": 0.9525011994646599, "grad_norm": 5.653637436781184, "learning_rate": 4.75300535407617e-07, "loss": 0.3606, "step": 18860 }, { "epoch": 0.9530062372162319, "grad_norm": 17.20756057482154, "learning_rate": 4.702495201535509e-07, "loss": 0.3513, "step": 18870 }, { "epoch": 0.9535112749678039, "grad_norm": 2.6415906411811094, "learning_rate": 4.651985048994848e-07, "loss": 0.3584, "step": 18880 }, { "epoch": 0.9540163127193758, "grad_norm": 3.3045378472333926, "learning_rate": 4.601474896454188e-07, "loss": 0.3672, "step": 18890 }, { "epoch": 0.9545213504709477, "grad_norm": 2.494761338609364, "learning_rate": 4.550964743913527e-07, "loss": 0.3479, "step": 18900 }, { "epoch": 0.9550263882225196, "grad_norm": 3.7846428264832346, "learning_rate": 4.5004545913728665e-07, "loss": 0.3594, "step": 18910 }, { "epoch": 0.9555314259740916, "grad_norm": 11.41402543242729, "learning_rate": 4.4499444388322056e-07, "loss": 0.3391, "step": 18920 }, { "epoch": 0.9560364637256635, "grad_norm": 23.109764846483394, "learning_rate": 4.399434286291545e-07, "loss": 0.3657, "step": 18930 }, { "epoch": 0.9565415014772354, "grad_norm": 2.8683015901455313, "learning_rate": 4.3489241337508844e-07, "loss": 0.365, "step": 18940 }, { "epoch": 0.9570465392288073, "grad_norm": 3.310696390265973, "learning_rate": 4.2984139812102235e-07, "loss": 0.3692, "step": 18950 }, { "epoch": 0.9575515769803793, "grad_norm": 3.223732467814084, "learning_rate": 4.247903828669563e-07, "loss": 0.3268, "step": 18960 }, { "epoch": 0.9580566147319513, "grad_norm": 3.067086167535032, "learning_rate": 4.1973936761289023e-07, "loss": 0.353, "step": 18970 }, { "epoch": 0.9585616524835231, "grad_norm": 4.735815421841188, "learning_rate": 4.1468835235882414e-07, "loss": 0.3444, "step": 18980 }, { "epoch": 0.959066690235095, "grad_norm": 3.425925550819413, "learning_rate": 4.096373371047581e-07, "loss": 0.3385, "step": 18990 }, { "epoch": 0.959571727986667, "grad_norm": 4.169456686292674, "learning_rate": 4.04586321850692e-07, "loss": 0.3635, "step": 19000 }, { "epoch": 0.960076765738239, "grad_norm": 2.7883327435971967, "learning_rate": 3.9953530659662593e-07, "loss": 0.3496, "step": 19010 }, { "epoch": 0.9605818034898108, "grad_norm": 6.631719541255136, "learning_rate": 3.944842913425599e-07, "loss": 0.3536, "step": 19020 }, { "epoch": 0.9610868412413828, "grad_norm": 6.199402518918171, "learning_rate": 3.8943327608849386e-07, "loss": 0.3492, "step": 19030 }, { "epoch": 0.9615918789929547, "grad_norm": 3.832122639682839, "learning_rate": 3.8438226083442777e-07, "loss": 0.3559, "step": 19040 }, { "epoch": 0.9620969167445267, "grad_norm": 5.054933419698706, "learning_rate": 3.793312455803617e-07, "loss": 0.3396, "step": 19050 }, { "epoch": 0.9626019544960985, "grad_norm": 4.931557429835319, "learning_rate": 3.742802303262956e-07, "loss": 0.361, "step": 19060 }, { "epoch": 0.9631069922476705, "grad_norm": 2.8243354412545663, "learning_rate": 3.692292150722295e-07, "loss": 0.3519, "step": 19070 }, { "epoch": 0.9636120299992424, "grad_norm": 2.9047769826763843, "learning_rate": 3.641781998181635e-07, "loss": 0.352, "step": 19080 }, { "epoch": 0.9641170677508144, "grad_norm": 3.5750399190488586, "learning_rate": 3.5912718456409744e-07, "loss": 0.3515, "step": 19090 }, { "epoch": 0.9646221055023863, "grad_norm": 4.083793717766302, "learning_rate": 3.5407616931003135e-07, "loss": 0.3364, "step": 19100 }, { "epoch": 0.9651271432539582, "grad_norm": 2.237959887146547, "learning_rate": 3.4902515405596526e-07, "loss": 0.3451, "step": 19110 }, { "epoch": 0.9656321810055302, "grad_norm": 3.1768900583422597, "learning_rate": 3.4397413880189917e-07, "loss": 0.3468, "step": 19120 }, { "epoch": 0.9661372187571021, "grad_norm": 4.21068826694376, "learning_rate": 3.389231235478332e-07, "loss": 0.3412, "step": 19130 }, { "epoch": 0.966642256508674, "grad_norm": 5.657422019205803, "learning_rate": 3.338721082937671e-07, "loss": 0.3439, "step": 19140 }, { "epoch": 0.9671472942602459, "grad_norm": 6.306482550657807, "learning_rate": 3.28821093039701e-07, "loss": 0.3668, "step": 19150 }, { "epoch": 0.9676523320118179, "grad_norm": 3.949528139283546, "learning_rate": 3.237700777856349e-07, "loss": 0.3521, "step": 19160 }, { "epoch": 0.9681573697633898, "grad_norm": 3.8577102555565457, "learning_rate": 3.1871906253156884e-07, "loss": 0.3432, "step": 19170 }, { "epoch": 0.9686624075149618, "grad_norm": 10.01075740910154, "learning_rate": 3.1366804727750275e-07, "loss": 0.349, "step": 19180 }, { "epoch": 0.9691674452665336, "grad_norm": 5.366835858110643, "learning_rate": 3.086170320234367e-07, "loss": 0.3347, "step": 19190 }, { "epoch": 0.9696724830181056, "grad_norm": 3.0600066927692553, "learning_rate": 3.035660167693707e-07, "loss": 0.3484, "step": 19200 }, { "epoch": 0.9701775207696776, "grad_norm": 3.3199457968898933, "learning_rate": 2.985150015153046e-07, "loss": 0.3662, "step": 19210 }, { "epoch": 0.9706825585212495, "grad_norm": 8.305484409899465, "learning_rate": 2.934639862612385e-07, "loss": 0.3401, "step": 19220 }, { "epoch": 0.9711875962728214, "grad_norm": 4.5596702016391575, "learning_rate": 2.8841297100717247e-07, "loss": 0.3453, "step": 19230 }, { "epoch": 0.9716926340243933, "grad_norm": 3.1404063614755056, "learning_rate": 2.833619557531064e-07, "loss": 0.3568, "step": 19240 }, { "epoch": 0.9721976717759653, "grad_norm": 2.1279962723109542, "learning_rate": 2.7831094049904034e-07, "loss": 0.3652, "step": 19250 }, { "epoch": 0.9727027095275372, "grad_norm": 8.103709676163785, "learning_rate": 2.7325992524497426e-07, "loss": 0.3599, "step": 19260 }, { "epoch": 0.9732077472791091, "grad_norm": 3.9079606526402695, "learning_rate": 2.6820890999090817e-07, "loss": 0.3332, "step": 19270 }, { "epoch": 0.973712785030681, "grad_norm": 2.834072428824953, "learning_rate": 2.6315789473684213e-07, "loss": 0.3587, "step": 19280 }, { "epoch": 0.974217822782253, "grad_norm": 3.331437469521485, "learning_rate": 2.5810687948277604e-07, "loss": 0.3532, "step": 19290 }, { "epoch": 0.974722860533825, "grad_norm": 4.5771090753301955, "learning_rate": 2.5305586422871e-07, "loss": 0.3619, "step": 19300 }, { "epoch": 0.9752278982853968, "grad_norm": 2.1169984635243897, "learning_rate": 2.480048489746439e-07, "loss": 0.3338, "step": 19310 }, { "epoch": 0.9757329360369688, "grad_norm": 2.4727042377727075, "learning_rate": 2.429538337205779e-07, "loss": 0.348, "step": 19320 }, { "epoch": 0.9762379737885407, "grad_norm": 4.382104299041976, "learning_rate": 2.3790281846651177e-07, "loss": 0.3608, "step": 19330 }, { "epoch": 0.9767430115401127, "grad_norm": 24.576345877505307, "learning_rate": 2.3285180321244574e-07, "loss": 0.3344, "step": 19340 }, { "epoch": 0.9772480492916845, "grad_norm": 5.068003336801587, "learning_rate": 2.2780078795837965e-07, "loss": 0.34, "step": 19350 }, { "epoch": 0.9777530870432565, "grad_norm": 5.036323992558192, "learning_rate": 2.2274977270431359e-07, "loss": 0.3534, "step": 19360 }, { "epoch": 0.9782581247948284, "grad_norm": 8.0901350202218, "learning_rate": 2.1769875745024752e-07, "loss": 0.3563, "step": 19370 }, { "epoch": 0.9787631625464004, "grad_norm": 3.2338907871203575, "learning_rate": 2.1264774219618146e-07, "loss": 0.342, "step": 19380 }, { "epoch": 0.9792682002979722, "grad_norm": 7.152607151035173, "learning_rate": 2.0759672694211537e-07, "loss": 0.3585, "step": 19390 }, { "epoch": 0.9797732380495442, "grad_norm": 2.8408399191335234, "learning_rate": 2.025457116880493e-07, "loss": 0.3408, "step": 19400 }, { "epoch": 0.9802782758011162, "grad_norm": 3.335420575405322, "learning_rate": 1.9749469643398325e-07, "loss": 0.3626, "step": 19410 }, { "epoch": 0.9807833135526881, "grad_norm": 4.427107484896195, "learning_rate": 1.9244368117991716e-07, "loss": 0.3385, "step": 19420 }, { "epoch": 0.98128835130426, "grad_norm": 5.431153322507852, "learning_rate": 1.8739266592585113e-07, "loss": 0.3513, "step": 19430 }, { "epoch": 0.9817933890558319, "grad_norm": 3.813640172056062, "learning_rate": 1.8234165067178504e-07, "loss": 0.3393, "step": 19440 }, { "epoch": 0.9822984268074039, "grad_norm": 3.898589096762485, "learning_rate": 1.7729063541771895e-07, "loss": 0.3405, "step": 19450 }, { "epoch": 0.9828034645589758, "grad_norm": 3.037916644567326, "learning_rate": 1.7223962016365292e-07, "loss": 0.3519, "step": 19460 }, { "epoch": 0.9833085023105477, "grad_norm": 3.190273896432863, "learning_rate": 1.6718860490958683e-07, "loss": 0.3533, "step": 19470 }, { "epoch": 0.9838135400621196, "grad_norm": 5.858585403848123, "learning_rate": 1.621375896555208e-07, "loss": 0.3443, "step": 19480 }, { "epoch": 0.9843185778136916, "grad_norm": 3.441915956155067, "learning_rate": 1.570865744014547e-07, "loss": 0.3554, "step": 19490 }, { "epoch": 0.9848236155652635, "grad_norm": 3.7424368716852348, "learning_rate": 1.5203555914738864e-07, "loss": 0.3587, "step": 19500 }, { "epoch": 0.9853286533168354, "grad_norm": 7.071299158156384, "learning_rate": 1.4698454389332256e-07, "loss": 0.3621, "step": 19510 }, { "epoch": 0.9858336910684073, "grad_norm": 3.8198683754171587, "learning_rate": 1.419335286392565e-07, "loss": 0.3724, "step": 19520 }, { "epoch": 0.9863387288199793, "grad_norm": 2.731071788924555, "learning_rate": 1.3688251338519043e-07, "loss": 0.3416, "step": 19530 }, { "epoch": 0.9868437665715513, "grad_norm": 3.7212383700962652, "learning_rate": 1.3183149813112437e-07, "loss": 0.3473, "step": 19540 }, { "epoch": 0.9873488043231231, "grad_norm": 3.302454861812917, "learning_rate": 1.267804828770583e-07, "loss": 0.3567, "step": 19550 }, { "epoch": 0.9878538420746951, "grad_norm": 13.424373803274289, "learning_rate": 1.2172946762299225e-07, "loss": 0.3462, "step": 19560 }, { "epoch": 0.988358879826267, "grad_norm": 2.5094569211373683, "learning_rate": 1.1667845236892617e-07, "loss": 0.3506, "step": 19570 }, { "epoch": 0.988863917577839, "grad_norm": 8.676691313011856, "learning_rate": 1.116274371148601e-07, "loss": 0.3399, "step": 19580 }, { "epoch": 0.9893689553294108, "grad_norm": 5.111065406394146, "learning_rate": 1.0657642186079402e-07, "loss": 0.3486, "step": 19590 }, { "epoch": 0.9898739930809828, "grad_norm": 3.1643666611399976, "learning_rate": 1.0152540660672796e-07, "loss": 0.3463, "step": 19600 }, { "epoch": 0.9903790308325547, "grad_norm": 3.7401113905250845, "learning_rate": 9.647439135266189e-08, "loss": 0.3462, "step": 19610 }, { "epoch": 0.9908840685841267, "grad_norm": 3.1161613433451696, "learning_rate": 9.142337609859582e-08, "loss": 0.3595, "step": 19620 }, { "epoch": 0.9913891063356985, "grad_norm": 7.461525216502298, "learning_rate": 8.637236084452976e-08, "loss": 0.3478, "step": 19630 }, { "epoch": 0.9918941440872705, "grad_norm": 3.278360748256368, "learning_rate": 8.13213455904637e-08, "loss": 0.3468, "step": 19640 }, { "epoch": 0.9923991818388425, "grad_norm": 2.3662646040461026, "learning_rate": 7.627033033639761e-08, "loss": 0.3334, "step": 19650 }, { "epoch": 0.9929042195904144, "grad_norm": 4.061819314850138, "learning_rate": 7.121931508233155e-08, "loss": 0.339, "step": 19660 }, { "epoch": 0.9934092573419863, "grad_norm": 3.0268165058919387, "learning_rate": 6.616829982826549e-08, "loss": 0.3358, "step": 19670 }, { "epoch": 0.9939142950935582, "grad_norm": 4.547975258460046, "learning_rate": 6.111728457419941e-08, "loss": 0.3338, "step": 19680 }, { "epoch": 0.9944193328451302, "grad_norm": 5.770374809427592, "learning_rate": 5.606626932013335e-08, "loss": 0.3448, "step": 19690 }, { "epoch": 0.9949243705967021, "grad_norm": 2.864633009769063, "learning_rate": 5.1015254066067285e-08, "loss": 0.3305, "step": 19700 }, { "epoch": 0.995429408348274, "grad_norm": 2.15308465344697, "learning_rate": 4.5964238812001217e-08, "loss": 0.3756, "step": 19710 }, { "epoch": 0.9959344460998459, "grad_norm": 4.787969766327734, "learning_rate": 4.091322355793514e-08, "loss": 0.343, "step": 19720 }, { "epoch": 0.9964394838514179, "grad_norm": 9.985119863463977, "learning_rate": 3.586220830386908e-08, "loss": 0.3436, "step": 19730 }, { "epoch": 0.9969445216029899, "grad_norm": 2.6886443082611673, "learning_rate": 3.081119304980301e-08, "loss": 0.3517, "step": 19740 }, { "epoch": 0.9974495593545617, "grad_norm": 7.198367620435292, "learning_rate": 2.5760177795736944e-08, "loss": 0.3608, "step": 19750 }, { "epoch": 0.9979545971061337, "grad_norm": 2.4433957061834106, "learning_rate": 2.0709162541670875e-08, "loss": 0.3587, "step": 19760 }, { "epoch": 0.9984596348577056, "grad_norm": 43.92952696515746, "learning_rate": 1.565814728760481e-08, "loss": 0.352, "step": 19770 }, { "epoch": 0.9989646726092776, "grad_norm": 5.395974747202963, "learning_rate": 1.0607132033538742e-08, "loss": 0.3339, "step": 19780 }, { "epoch": 0.9994697103608495, "grad_norm": 2.2549607655291393, "learning_rate": 5.556116779472674e-09, "loss": 0.3437, "step": 19790 }, { "epoch": 0.9999747481124214, "grad_norm": 4.084387999304972, "learning_rate": 5.051015254066067e-10, "loss": 0.3312, "step": 19800 } ], "logging_steps": 10, "max_steps": 19800, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.475990740795392e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }