{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999711408040172, "eval_steps": 500, "global_step": 17325, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005771839196559984, "grad_norm": 15.207763525645962, "learning_rate": 9.995959129481038e-06, "loss": 2.6, "step": 10 }, { "epoch": 0.0011543678393119967, "grad_norm": 12.615616581147215, "learning_rate": 9.990186457311089e-06, "loss": 0.7554, "step": 20 }, { "epoch": 0.0017315517589679952, "grad_norm": 38.969453309289854, "learning_rate": 9.984413785141142e-06, "loss": 0.6704, "step": 30 }, { "epoch": 0.0023087356786239935, "grad_norm": 22.432338112146017, "learning_rate": 9.978641112971195e-06, "loss": 0.6122, "step": 40 }, { "epoch": 0.0028859195982799918, "grad_norm": 6.366544502174357, "learning_rate": 9.972868440801248e-06, "loss": 0.5704, "step": 50 }, { "epoch": 0.0034631035179359905, "grad_norm": 5.991950420382146, "learning_rate": 9.967095768631301e-06, "loss": 0.5725, "step": 60 }, { "epoch": 0.004040287437591989, "grad_norm": 27.360506966186097, "learning_rate": 9.961323096461352e-06, "loss": 0.5616, "step": 70 }, { "epoch": 0.004617471357247987, "grad_norm": 7.6551554884885835, "learning_rate": 9.955550424291405e-06, "loss": 0.5581, "step": 80 }, { "epoch": 0.005194655276903985, "grad_norm": 5.380738579039347, "learning_rate": 9.949777752121458e-06, "loss": 0.5386, "step": 90 }, { "epoch": 0.0057718391965599835, "grad_norm": 14.701566820920394, "learning_rate": 9.944005079951511e-06, "loss": 0.5277, "step": 100 }, { "epoch": 0.006349023116215982, "grad_norm": 24.2926776932226, "learning_rate": 9.938232407781563e-06, "loss": 0.534, "step": 110 }, { "epoch": 0.006926207035871981, "grad_norm": 18.333541125312422, "learning_rate": 9.932459735611616e-06, "loss": 0.5357, "step": 120 }, { "epoch": 0.007503390955527979, "grad_norm": 4.413124779621301, "learning_rate": 9.926687063441667e-06, "loss": 0.5189, "step": 130 }, { "epoch": 0.008080574875183977, "grad_norm": 6.736449877624926, "learning_rate": 9.920914391271722e-06, "loss": 0.5089, "step": 140 }, { "epoch": 0.008657758794839977, "grad_norm": 7.499085213610092, "learning_rate": 9.915141719101773e-06, "loss": 0.515, "step": 150 }, { "epoch": 0.009234942714495974, "grad_norm": 7.213309423254868, "learning_rate": 9.909369046931826e-06, "loss": 0.516, "step": 160 }, { "epoch": 0.009812126634151973, "grad_norm": 6.224375486360128, "learning_rate": 9.903596374761877e-06, "loss": 0.4787, "step": 170 }, { "epoch": 0.01038931055380797, "grad_norm": 4.666329365397574, "learning_rate": 9.89782370259193e-06, "loss": 0.5457, "step": 180 }, { "epoch": 0.01096649447346397, "grad_norm": 7.269438501572431, "learning_rate": 9.892051030421983e-06, "loss": 0.4999, "step": 190 }, { "epoch": 0.011543678393119967, "grad_norm": 3.4282676515510633, "learning_rate": 9.886278358252037e-06, "loss": 0.4979, "step": 200 }, { "epoch": 0.012120862312775966, "grad_norm": 6.398684245798025, "learning_rate": 9.880505686082088e-06, "loss": 0.5097, "step": 210 }, { "epoch": 0.012698046232431964, "grad_norm": 9.308090815837401, "learning_rate": 9.874733013912141e-06, "loss": 0.5003, "step": 220 }, { "epoch": 0.013275230152087963, "grad_norm": 6.572806422530607, "learning_rate": 9.868960341742194e-06, "loss": 0.5151, "step": 230 }, { "epoch": 0.013852414071743962, "grad_norm": 3.9790215956458206, "learning_rate": 9.863187669572247e-06, "loss": 0.5036, "step": 240 }, { "epoch": 0.01442959799139996, "grad_norm": 9.129499916264713, "learning_rate": 9.857414997402298e-06, "loss": 0.4993, "step": 250 }, { "epoch": 0.015006781911055958, "grad_norm": 5.341270073182352, "learning_rate": 9.851642325232351e-06, "loss": 0.5068, "step": 260 }, { "epoch": 0.015583965830711956, "grad_norm": 8.88171344021306, "learning_rate": 9.845869653062403e-06, "loss": 0.5106, "step": 270 }, { "epoch": 0.016161149750367955, "grad_norm": 6.256421713727477, "learning_rate": 9.840096980892456e-06, "loss": 0.5012, "step": 280 }, { "epoch": 0.016738333670023954, "grad_norm": 14.828284045011356, "learning_rate": 9.834324308722509e-06, "loss": 0.4916, "step": 290 }, { "epoch": 0.017315517589679953, "grad_norm": 13.134769047818303, "learning_rate": 9.828551636552562e-06, "loss": 0.5023, "step": 300 }, { "epoch": 0.01789270150933595, "grad_norm": 13.132579195124707, "learning_rate": 9.822778964382613e-06, "loss": 0.4979, "step": 310 }, { "epoch": 0.018469885428991948, "grad_norm": 7.799223760078651, "learning_rate": 9.817006292212666e-06, "loss": 0.5047, "step": 320 }, { "epoch": 0.019047069348647947, "grad_norm": 9.15638603603503, "learning_rate": 9.811233620042719e-06, "loss": 0.5089, "step": 330 }, { "epoch": 0.019624253268303946, "grad_norm": 4.959585555459444, "learning_rate": 9.805460947872772e-06, "loss": 0.4952, "step": 340 }, { "epoch": 0.020201437187959942, "grad_norm": 11.68075308396199, "learning_rate": 9.799688275702823e-06, "loss": 0.4947, "step": 350 }, { "epoch": 0.02077862110761594, "grad_norm": 5.031742433330389, "learning_rate": 9.793915603532876e-06, "loss": 0.4722, "step": 360 }, { "epoch": 0.02135580502727194, "grad_norm": 7.396670940592179, "learning_rate": 9.788142931362928e-06, "loss": 0.4784, "step": 370 }, { "epoch": 0.02193298894692794, "grad_norm": 12.81464819318774, "learning_rate": 9.78237025919298e-06, "loss": 0.5016, "step": 380 }, { "epoch": 0.02251017286658394, "grad_norm": 6.952612493218752, "learning_rate": 9.776597587023034e-06, "loss": 0.486, "step": 390 }, { "epoch": 0.023087356786239934, "grad_norm": 3.8994319647851134, "learning_rate": 9.770824914853087e-06, "loss": 0.5066, "step": 400 }, { "epoch": 0.023664540705895933, "grad_norm": 8.754480087072261, "learning_rate": 9.765052242683138e-06, "loss": 0.477, "step": 410 }, { "epoch": 0.024241724625551932, "grad_norm": 14.00837557848952, "learning_rate": 9.759279570513191e-06, "loss": 0.4698, "step": 420 }, { "epoch": 0.02481890854520793, "grad_norm": 17.668700193812345, "learning_rate": 9.753506898343244e-06, "loss": 0.4809, "step": 430 }, { "epoch": 0.025396092464863927, "grad_norm": 7.562292913968297, "learning_rate": 9.747734226173297e-06, "loss": 0.5051, "step": 440 }, { "epoch": 0.025973276384519926, "grad_norm": 10.300306620163619, "learning_rate": 9.741961554003348e-06, "loss": 0.5036, "step": 450 }, { "epoch": 0.026550460304175925, "grad_norm": 10.310969163346487, "learning_rate": 9.736188881833401e-06, "loss": 0.5029, "step": 460 }, { "epoch": 0.027127644223831925, "grad_norm": 5.858481709305026, "learning_rate": 9.730416209663453e-06, "loss": 0.5021, "step": 470 }, { "epoch": 0.027704828143487924, "grad_norm": 92.82221880785818, "learning_rate": 9.724643537493506e-06, "loss": 0.487, "step": 480 }, { "epoch": 0.02828201206314392, "grad_norm": 5.060105328004147, "learning_rate": 9.718870865323559e-06, "loss": 0.5053, "step": 490 }, { "epoch": 0.02885919598279992, "grad_norm": 5.269210555899195, "learning_rate": 9.713098193153612e-06, "loss": 0.5028, "step": 500 }, { "epoch": 0.029436379902455918, "grad_norm": 2.809713279481296, "learning_rate": 9.707325520983663e-06, "loss": 0.4789, "step": 510 }, { "epoch": 0.030013563822111917, "grad_norm": 5.364421325217877, "learning_rate": 9.701552848813716e-06, "loss": 0.488, "step": 520 }, { "epoch": 0.030590747741767916, "grad_norm": 81.42225421545997, "learning_rate": 9.69578017664377e-06, "loss": 0.4923, "step": 530 }, { "epoch": 0.03116793166142391, "grad_norm": 5.472841855533951, "learning_rate": 9.690007504473822e-06, "loss": 0.4801, "step": 540 }, { "epoch": 0.03174511558107991, "grad_norm": 9.314853911550161, "learning_rate": 9.684234832303874e-06, "loss": 0.4889, "step": 550 }, { "epoch": 0.03232229950073591, "grad_norm": 5.657736368331141, "learning_rate": 9.678462160133927e-06, "loss": 0.468, "step": 560 }, { "epoch": 0.03289948342039191, "grad_norm": 5.316343227859894, "learning_rate": 9.672689487963978e-06, "loss": 0.4636, "step": 570 }, { "epoch": 0.03347666734004791, "grad_norm": 6.818051213532348, "learning_rate": 9.666916815794033e-06, "loss": 0.4785, "step": 580 }, { "epoch": 0.03405385125970391, "grad_norm": 5.515733833721638, "learning_rate": 9.661144143624086e-06, "loss": 0.4676, "step": 590 }, { "epoch": 0.034631035179359906, "grad_norm": 4.414342893400196, "learning_rate": 9.655371471454137e-06, "loss": 0.4536, "step": 600 }, { "epoch": 0.0352082190990159, "grad_norm": 5.5253782212489355, "learning_rate": 9.64959879928419e-06, "loss": 0.4831, "step": 610 }, { "epoch": 0.0357854030186719, "grad_norm": 3.47774426972913, "learning_rate": 9.643826127114241e-06, "loss": 0.4877, "step": 620 }, { "epoch": 0.0363625869383279, "grad_norm": 7.150841155138223, "learning_rate": 9.638053454944294e-06, "loss": 0.4629, "step": 630 }, { "epoch": 0.036939770857983896, "grad_norm": 3.7894713504219357, "learning_rate": 9.632280782774347e-06, "loss": 0.4816, "step": 640 }, { "epoch": 0.037516954777639895, "grad_norm": 5.346386715612294, "learning_rate": 9.6265081106044e-06, "loss": 0.4861, "step": 650 }, { "epoch": 0.038094138697295894, "grad_norm": 4.822544974681145, "learning_rate": 9.620735438434452e-06, "loss": 0.4756, "step": 660 }, { "epoch": 0.03867132261695189, "grad_norm": 5.810598113453792, "learning_rate": 9.614962766264505e-06, "loss": 0.4752, "step": 670 }, { "epoch": 0.03924850653660789, "grad_norm": 19.319543215067025, "learning_rate": 9.609190094094558e-06, "loss": 0.4664, "step": 680 }, { "epoch": 0.03982569045626389, "grad_norm": 3.6009359491010864, "learning_rate": 9.60341742192461e-06, "loss": 0.481, "step": 690 }, { "epoch": 0.040402874375919884, "grad_norm": 5.102687170049628, "learning_rate": 9.597644749754662e-06, "loss": 0.4751, "step": 700 }, { "epoch": 0.04098005829557588, "grad_norm": 6.202922207392348, "learning_rate": 9.591872077584715e-06, "loss": 0.4849, "step": 710 }, { "epoch": 0.04155724221523188, "grad_norm": 7.8448485511355965, "learning_rate": 9.586099405414766e-06, "loss": 0.4947, "step": 720 }, { "epoch": 0.04213442613488788, "grad_norm": 3.9340219935160863, "learning_rate": 9.58032673324482e-06, "loss": 0.4948, "step": 730 }, { "epoch": 0.04271161005454388, "grad_norm": 4.596617570306747, "learning_rate": 9.574554061074873e-06, "loss": 0.4743, "step": 740 }, { "epoch": 0.04328879397419988, "grad_norm": 2.1114969175976923, "learning_rate": 9.568781388904926e-06, "loss": 0.4717, "step": 750 }, { "epoch": 0.04386597789385588, "grad_norm": 5.18107474670299, "learning_rate": 9.563008716734977e-06, "loss": 0.468, "step": 760 }, { "epoch": 0.04444316181351188, "grad_norm": 4.705370463352637, "learning_rate": 9.55723604456503e-06, "loss": 0.4594, "step": 770 }, { "epoch": 0.04502034573316788, "grad_norm": 4.477204626343746, "learning_rate": 9.551463372395083e-06, "loss": 0.4726, "step": 780 }, { "epoch": 0.04559752965282387, "grad_norm": 5.11055150918499, "learning_rate": 9.545690700225136e-06, "loss": 0.4878, "step": 790 }, { "epoch": 0.04617471357247987, "grad_norm": 5.92970242815697, "learning_rate": 9.539918028055187e-06, "loss": 0.4562, "step": 800 }, { "epoch": 0.04675189749213587, "grad_norm": 32.04814941479638, "learning_rate": 9.53414535588524e-06, "loss": 0.4668, "step": 810 }, { "epoch": 0.047329081411791867, "grad_norm": 7.483269872312162, "learning_rate": 9.528372683715292e-06, "loss": 0.4593, "step": 820 }, { "epoch": 0.047906265331447866, "grad_norm": 3.0548203710383026, "learning_rate": 9.522600011545345e-06, "loss": 0.4734, "step": 830 }, { "epoch": 0.048483449251103865, "grad_norm": 2.846647873568613, "learning_rate": 9.516827339375398e-06, "loss": 0.4583, "step": 840 }, { "epoch": 0.049060633170759864, "grad_norm": 2.9958931469528753, "learning_rate": 9.51105466720545e-06, "loss": 0.4503, "step": 850 }, { "epoch": 0.04963781709041586, "grad_norm": 3.669267079128399, "learning_rate": 9.505281995035502e-06, "loss": 0.4543, "step": 860 }, { "epoch": 0.05021500101007186, "grad_norm": 4.959841634234083, "learning_rate": 9.499509322865555e-06, "loss": 0.4638, "step": 870 }, { "epoch": 0.050792184929727854, "grad_norm": 6.2400754071583355, "learning_rate": 9.493736650695608e-06, "loss": 0.4675, "step": 880 }, { "epoch": 0.051369368849383854, "grad_norm": 4.8904554144848325, "learning_rate": 9.487963978525661e-06, "loss": 0.4795, "step": 890 }, { "epoch": 0.05194655276903985, "grad_norm": 5.718936473226969, "learning_rate": 9.482191306355712e-06, "loss": 0.475, "step": 900 }, { "epoch": 0.05252373668869585, "grad_norm": 2.813275923923208, "learning_rate": 9.476418634185765e-06, "loss": 0.4588, "step": 910 }, { "epoch": 0.05310092060835185, "grad_norm": 3.7622872130635825, "learning_rate": 9.470645962015817e-06, "loss": 0.4738, "step": 920 }, { "epoch": 0.05367810452800785, "grad_norm": 5.251759558774021, "learning_rate": 9.464873289845871e-06, "loss": 0.4548, "step": 930 }, { "epoch": 0.05425528844766385, "grad_norm": 4.650175199113373, "learning_rate": 9.459100617675923e-06, "loss": 0.469, "step": 940 }, { "epoch": 0.05483247236731985, "grad_norm": 19.07121359753558, "learning_rate": 9.453327945505976e-06, "loss": 0.4593, "step": 950 }, { "epoch": 0.05540965628697585, "grad_norm": 4.899239522938927, "learning_rate": 9.447555273336027e-06, "loss": 0.4693, "step": 960 }, { "epoch": 0.05598684020663185, "grad_norm": 11.615659586114845, "learning_rate": 9.44178260116608e-06, "loss": 0.4447, "step": 970 }, { "epoch": 0.05656402412628784, "grad_norm": 10.04941518596728, "learning_rate": 9.436009928996133e-06, "loss": 0.4617, "step": 980 }, { "epoch": 0.05714120804594384, "grad_norm": 8.509458377026059, "learning_rate": 9.430237256826186e-06, "loss": 0.4449, "step": 990 }, { "epoch": 0.05771839196559984, "grad_norm": 7.280354559581083, "learning_rate": 9.424464584656238e-06, "loss": 0.4473, "step": 1000 }, { "epoch": 0.058295575885255836, "grad_norm": 6.178315104553298, "learning_rate": 9.41869191248629e-06, "loss": 0.4489, "step": 1010 }, { "epoch": 0.058872759804911835, "grad_norm": 4.326714170134293, "learning_rate": 9.412919240316344e-06, "loss": 0.4421, "step": 1020 }, { "epoch": 0.059449943724567834, "grad_norm": 5.121692072940591, "learning_rate": 9.407146568146397e-06, "loss": 0.4474, "step": 1030 }, { "epoch": 0.060027127644223834, "grad_norm": 9.081808249358685, "learning_rate": 9.401373895976448e-06, "loss": 0.4425, "step": 1040 }, { "epoch": 0.06060431156387983, "grad_norm": 3.1699426939861644, "learning_rate": 9.395601223806501e-06, "loss": 0.4527, "step": 1050 }, { "epoch": 0.06118149548353583, "grad_norm": 4.867955044244513, "learning_rate": 9.389828551636552e-06, "loss": 0.4364, "step": 1060 }, { "epoch": 0.061758679403191824, "grad_norm": 4.757339575426131, "learning_rate": 9.384055879466605e-06, "loss": 0.4744, "step": 1070 }, { "epoch": 0.06233586332284782, "grad_norm": 7.0039435743401235, "learning_rate": 9.378283207296658e-06, "loss": 0.4542, "step": 1080 }, { "epoch": 0.06291304724250382, "grad_norm": 3.3067436767056964, "learning_rate": 9.372510535126711e-06, "loss": 0.4434, "step": 1090 }, { "epoch": 0.06349023116215982, "grad_norm": 5.215962094810896, "learning_rate": 9.366737862956763e-06, "loss": 0.4413, "step": 1100 }, { "epoch": 0.06406741508181582, "grad_norm": 5.30554157155974, "learning_rate": 9.360965190786816e-06, "loss": 0.4492, "step": 1110 }, { "epoch": 0.06464459900147182, "grad_norm": 9.282132535227714, "learning_rate": 9.355192518616869e-06, "loss": 0.4552, "step": 1120 }, { "epoch": 0.06522178292112782, "grad_norm": 4.436227742317887, "learning_rate": 9.349419846446922e-06, "loss": 0.4629, "step": 1130 }, { "epoch": 0.06579896684078382, "grad_norm": 5.072582901364672, "learning_rate": 9.343647174276975e-06, "loss": 0.4394, "step": 1140 }, { "epoch": 0.06637615076043982, "grad_norm": 3.6484490415265287, "learning_rate": 9.337874502107026e-06, "loss": 0.4404, "step": 1150 }, { "epoch": 0.06695333468009582, "grad_norm": 5.179047870445265, "learning_rate": 9.332101829937079e-06, "loss": 0.4401, "step": 1160 }, { "epoch": 0.06753051859975182, "grad_norm": 5.1113945913805345, "learning_rate": 9.32632915776713e-06, "loss": 0.4642, "step": 1170 }, { "epoch": 0.06810770251940781, "grad_norm": 13.536539735261888, "learning_rate": 9.320556485597183e-06, "loss": 0.44, "step": 1180 }, { "epoch": 0.06868488643906381, "grad_norm": 39.38178075944957, "learning_rate": 9.314783813427236e-06, "loss": 0.4401, "step": 1190 }, { "epoch": 0.06926207035871981, "grad_norm": 5.347846132280397, "learning_rate": 9.30901114125729e-06, "loss": 0.4279, "step": 1200 }, { "epoch": 0.0698392542783758, "grad_norm": 17.998657342305947, "learning_rate": 9.30323846908734e-06, "loss": 0.4289, "step": 1210 }, { "epoch": 0.0704164381980318, "grad_norm": 3.352558494444376, "learning_rate": 9.297465796917394e-06, "loss": 0.4345, "step": 1220 }, { "epoch": 0.0709936221176878, "grad_norm": 5.425024450686575, "learning_rate": 9.291693124747447e-06, "loss": 0.4551, "step": 1230 }, { "epoch": 0.0715708060373438, "grad_norm": 5.053276208988115, "learning_rate": 9.2859204525775e-06, "loss": 0.455, "step": 1240 }, { "epoch": 0.0721479899569998, "grad_norm": 4.369475575649534, "learning_rate": 9.280147780407551e-06, "loss": 0.4359, "step": 1250 }, { "epoch": 0.0727251738766558, "grad_norm": 7.966064775548766, "learning_rate": 9.274375108237604e-06, "loss": 0.4408, "step": 1260 }, { "epoch": 0.07330235779631179, "grad_norm": 2.9840558262546626, "learning_rate": 9.268602436067656e-06, "loss": 0.445, "step": 1270 }, { "epoch": 0.07387954171596779, "grad_norm": 6.923266339159757, "learning_rate": 9.26282976389771e-06, "loss": 0.4319, "step": 1280 }, { "epoch": 0.07445672563562379, "grad_norm": 5.445323456459659, "learning_rate": 9.257057091727762e-06, "loss": 0.4649, "step": 1290 }, { "epoch": 0.07503390955527979, "grad_norm": 2.258392109239514, "learning_rate": 9.251284419557815e-06, "loss": 0.4435, "step": 1300 }, { "epoch": 0.07561109347493579, "grad_norm": 6.963934239231957, "learning_rate": 9.245511747387866e-06, "loss": 0.4516, "step": 1310 }, { "epoch": 0.07618827739459179, "grad_norm": 3.741922936601378, "learning_rate": 9.239739075217919e-06, "loss": 0.4457, "step": 1320 }, { "epoch": 0.07676546131424779, "grad_norm": 4.697657485962023, "learning_rate": 9.233966403047972e-06, "loss": 0.4585, "step": 1330 }, { "epoch": 0.07734264523390379, "grad_norm": 2.7800726567106886, "learning_rate": 9.228193730878025e-06, "loss": 0.4303, "step": 1340 }, { "epoch": 0.07791982915355979, "grad_norm": 2.374574795518818, "learning_rate": 9.222421058708076e-06, "loss": 0.4468, "step": 1350 }, { "epoch": 0.07849701307321579, "grad_norm": 3.379706889838213, "learning_rate": 9.21664838653813e-06, "loss": 0.4468, "step": 1360 }, { "epoch": 0.07907419699287178, "grad_norm": 5.02793266796178, "learning_rate": 9.210875714368182e-06, "loss": 0.4541, "step": 1370 }, { "epoch": 0.07965138091252778, "grad_norm": 5.377287835326619, "learning_rate": 9.205103042198235e-06, "loss": 0.4577, "step": 1380 }, { "epoch": 0.08022856483218377, "grad_norm": 4.118238939930749, "learning_rate": 9.199330370028287e-06, "loss": 0.4535, "step": 1390 }, { "epoch": 0.08080574875183977, "grad_norm": 3.6462071059200785, "learning_rate": 9.19355769785834e-06, "loss": 0.452, "step": 1400 }, { "epoch": 0.08138293267149577, "grad_norm": 2.023709004387077, "learning_rate": 9.187785025688391e-06, "loss": 0.4385, "step": 1410 }, { "epoch": 0.08196011659115177, "grad_norm": 2.718469268180074, "learning_rate": 9.182012353518444e-06, "loss": 0.4565, "step": 1420 }, { "epoch": 0.08253730051080777, "grad_norm": 4.494982724398743, "learning_rate": 9.176239681348497e-06, "loss": 0.455, "step": 1430 }, { "epoch": 0.08311448443046376, "grad_norm": 3.008004247279657, "learning_rate": 9.17046700917855e-06, "loss": 0.4338, "step": 1440 }, { "epoch": 0.08369166835011976, "grad_norm": 4.471511152653035, "learning_rate": 9.164694337008601e-06, "loss": 0.4498, "step": 1450 }, { "epoch": 0.08426885226977576, "grad_norm": 41.046308308564996, "learning_rate": 9.158921664838654e-06, "loss": 0.444, "step": 1460 }, { "epoch": 0.08484603618943176, "grad_norm": 2.2817565591543087, "learning_rate": 9.153148992668707e-06, "loss": 0.4524, "step": 1470 }, { "epoch": 0.08542322010908776, "grad_norm": 2.7552178530343974, "learning_rate": 9.14737632049876e-06, "loss": 0.4395, "step": 1480 }, { "epoch": 0.08600040402874376, "grad_norm": 7.350119226751549, "learning_rate": 9.141603648328812e-06, "loss": 0.4439, "step": 1490 }, { "epoch": 0.08657758794839976, "grad_norm": 4.927329771744563, "learning_rate": 9.135830976158865e-06, "loss": 0.4435, "step": 1500 }, { "epoch": 0.08715477186805576, "grad_norm": 3.2061592885446433, "learning_rate": 9.130058303988916e-06, "loss": 0.4551, "step": 1510 }, { "epoch": 0.08773195578771176, "grad_norm": 2.81815402264405, "learning_rate": 9.12428563181897e-06, "loss": 0.4541, "step": 1520 }, { "epoch": 0.08830913970736776, "grad_norm": 1.8995196764329627, "learning_rate": 9.118512959649022e-06, "loss": 0.426, "step": 1530 }, { "epoch": 0.08888632362702376, "grad_norm": 4.57597268577496, "learning_rate": 9.112740287479075e-06, "loss": 0.4388, "step": 1540 }, { "epoch": 0.08946350754667975, "grad_norm": 4.888253541319005, "learning_rate": 9.106967615309127e-06, "loss": 0.4557, "step": 1550 }, { "epoch": 0.09004069146633575, "grad_norm": 2.7824915648882853, "learning_rate": 9.10119494313918e-06, "loss": 0.4544, "step": 1560 }, { "epoch": 0.09061787538599175, "grad_norm": 4.3535000068694645, "learning_rate": 9.095422270969233e-06, "loss": 0.4545, "step": 1570 }, { "epoch": 0.09119505930564774, "grad_norm": 3.465323902631204, "learning_rate": 9.089649598799286e-06, "loss": 0.4343, "step": 1580 }, { "epoch": 0.09177224322530374, "grad_norm": 3.685921584283666, "learning_rate": 9.083876926629337e-06, "loss": 0.4571, "step": 1590 }, { "epoch": 0.09234942714495974, "grad_norm": 3.7270367548310457, "learning_rate": 9.07810425445939e-06, "loss": 0.4368, "step": 1600 }, { "epoch": 0.09292661106461574, "grad_norm": 4.635316351567143, "learning_rate": 9.072331582289441e-06, "loss": 0.4333, "step": 1610 }, { "epoch": 0.09350379498427173, "grad_norm": 7.835128472709014, "learning_rate": 9.066558910119494e-06, "loss": 0.4371, "step": 1620 }, { "epoch": 0.09408097890392773, "grad_norm": 3.497453269659607, "learning_rate": 9.060786237949547e-06, "loss": 0.441, "step": 1630 }, { "epoch": 0.09465816282358373, "grad_norm": 8.942601742638523, "learning_rate": 9.0550135657796e-06, "loss": 0.4381, "step": 1640 }, { "epoch": 0.09523534674323973, "grad_norm": 4.589054052495082, "learning_rate": 9.049240893609653e-06, "loss": 0.4382, "step": 1650 }, { "epoch": 0.09581253066289573, "grad_norm": 5.380659756568862, "learning_rate": 9.043468221439705e-06, "loss": 0.4409, "step": 1660 }, { "epoch": 0.09638971458255173, "grad_norm": 2.868711750692323, "learning_rate": 9.037695549269758e-06, "loss": 0.4456, "step": 1670 }, { "epoch": 0.09696689850220773, "grad_norm": 2.5446684388172884, "learning_rate": 9.03192287709981e-06, "loss": 0.4491, "step": 1680 }, { "epoch": 0.09754408242186373, "grad_norm": 2.258241750087974, "learning_rate": 9.026150204929864e-06, "loss": 0.4366, "step": 1690 }, { "epoch": 0.09812126634151973, "grad_norm": 2.602005403010541, "learning_rate": 9.020377532759915e-06, "loss": 0.4336, "step": 1700 }, { "epoch": 0.09869845026117573, "grad_norm": 2.209187153695394, "learning_rate": 9.014604860589968e-06, "loss": 0.4438, "step": 1710 }, { "epoch": 0.09927563418083173, "grad_norm": 3.0657261742371205, "learning_rate": 9.008832188420021e-06, "loss": 0.4438, "step": 1720 }, { "epoch": 0.09985281810048773, "grad_norm": 3.199466707600606, "learning_rate": 9.003059516250074e-06, "loss": 0.4315, "step": 1730 }, { "epoch": 0.10043000202014372, "grad_norm": 2.6315666639919657, "learning_rate": 8.997286844080125e-06, "loss": 0.4309, "step": 1740 }, { "epoch": 0.10100718593979972, "grad_norm": 2.438482770459306, "learning_rate": 8.991514171910178e-06, "loss": 0.4385, "step": 1750 }, { "epoch": 0.10158436985945571, "grad_norm": 17.29321542355523, "learning_rate": 8.98574149974023e-06, "loss": 0.4408, "step": 1760 }, { "epoch": 0.10216155377911171, "grad_norm": 4.982684643457552, "learning_rate": 8.979968827570283e-06, "loss": 0.4298, "step": 1770 }, { "epoch": 0.10273873769876771, "grad_norm": 6.799624435849738, "learning_rate": 8.974196155400336e-06, "loss": 0.4334, "step": 1780 }, { "epoch": 0.1033159216184237, "grad_norm": 3.774388255492694, "learning_rate": 8.968423483230389e-06, "loss": 0.4308, "step": 1790 }, { "epoch": 0.1038931055380797, "grad_norm": 10.033128674066695, "learning_rate": 8.96265081106044e-06, "loss": 0.4341, "step": 1800 }, { "epoch": 0.1044702894577357, "grad_norm": 2.5219113776584123, "learning_rate": 8.956878138890493e-06, "loss": 0.4362, "step": 1810 }, { "epoch": 0.1050474733773917, "grad_norm": 2.419786699423789, "learning_rate": 8.951105466720546e-06, "loss": 0.4242, "step": 1820 }, { "epoch": 0.1056246572970477, "grad_norm": 60.08295959626865, "learning_rate": 8.9453327945506e-06, "loss": 0.4385, "step": 1830 }, { "epoch": 0.1062018412167037, "grad_norm": 3.096217392102352, "learning_rate": 8.93956012238065e-06, "loss": 0.4241, "step": 1840 }, { "epoch": 0.1067790251363597, "grad_norm": 2.5832709944291503, "learning_rate": 8.933787450210704e-06, "loss": 0.4361, "step": 1850 }, { "epoch": 0.1073562090560157, "grad_norm": 2.3859955361814302, "learning_rate": 8.928014778040755e-06, "loss": 0.4213, "step": 1860 }, { "epoch": 0.1079333929756717, "grad_norm": 3.223331997025027, "learning_rate": 8.922242105870808e-06, "loss": 0.4489, "step": 1870 }, { "epoch": 0.1085105768953277, "grad_norm": 4.411834387897919, "learning_rate": 8.916469433700861e-06, "loss": 0.4336, "step": 1880 }, { "epoch": 0.1090877608149837, "grad_norm": 9.832585313890915, "learning_rate": 8.910696761530914e-06, "loss": 0.4441, "step": 1890 }, { "epoch": 0.1096649447346397, "grad_norm": 5.365037688190915, "learning_rate": 8.904924089360965e-06, "loss": 0.4367, "step": 1900 }, { "epoch": 0.1102421286542957, "grad_norm": 3.819456675382363, "learning_rate": 8.899151417191018e-06, "loss": 0.4272, "step": 1910 }, { "epoch": 0.1108193125739517, "grad_norm": 3.036048623046518, "learning_rate": 8.893378745021071e-06, "loss": 0.4338, "step": 1920 }, { "epoch": 0.1113964964936077, "grad_norm": 3.2508150432730196, "learning_rate": 8.887606072851124e-06, "loss": 0.4469, "step": 1930 }, { "epoch": 0.1119736804132637, "grad_norm": 2.9078235731412305, "learning_rate": 8.881833400681176e-06, "loss": 0.4452, "step": 1940 }, { "epoch": 0.11255086433291968, "grad_norm": 2.4501912449235648, "learning_rate": 8.876060728511229e-06, "loss": 0.4246, "step": 1950 }, { "epoch": 0.11312804825257568, "grad_norm": 6.242529869364432, "learning_rate": 8.87028805634128e-06, "loss": 0.4459, "step": 1960 }, { "epoch": 0.11370523217223168, "grad_norm": 25.285733450369438, "learning_rate": 8.864515384171333e-06, "loss": 0.4377, "step": 1970 }, { "epoch": 0.11428241609188768, "grad_norm": 3.177411911875137, "learning_rate": 8.858742712001386e-06, "loss": 0.4197, "step": 1980 }, { "epoch": 0.11485960001154367, "grad_norm": 3.117021809901308, "learning_rate": 8.852970039831439e-06, "loss": 0.4147, "step": 1990 }, { "epoch": 0.11543678393119967, "grad_norm": 3.919723512318409, "learning_rate": 8.84719736766149e-06, "loss": 0.4414, "step": 2000 }, { "epoch": 0.11601396785085567, "grad_norm": 2.9342199285272796, "learning_rate": 8.841424695491543e-06, "loss": 0.427, "step": 2010 }, { "epoch": 0.11659115177051167, "grad_norm": 1.918258053028722, "learning_rate": 8.835652023321596e-06, "loss": 0.4381, "step": 2020 }, { "epoch": 0.11716833569016767, "grad_norm": 1.7751748211927454, "learning_rate": 8.82987935115165e-06, "loss": 0.4228, "step": 2030 }, { "epoch": 0.11774551960982367, "grad_norm": 2.2012455830534385, "learning_rate": 8.8241066789817e-06, "loss": 0.4209, "step": 2040 }, { "epoch": 0.11832270352947967, "grad_norm": 2.8237417214584544, "learning_rate": 8.818334006811754e-06, "loss": 0.4177, "step": 2050 }, { "epoch": 0.11889988744913567, "grad_norm": 4.6392306678295805, "learning_rate": 8.812561334641805e-06, "loss": 0.4087, "step": 2060 }, { "epoch": 0.11947707136879167, "grad_norm": 2.735063304359889, "learning_rate": 8.80678866247186e-06, "loss": 0.4272, "step": 2070 }, { "epoch": 0.12005425528844767, "grad_norm": 3.0299802933466657, "learning_rate": 8.801015990301911e-06, "loss": 0.4371, "step": 2080 }, { "epoch": 0.12063143920810367, "grad_norm": 5.124807390829953, "learning_rate": 8.795243318131964e-06, "loss": 0.4165, "step": 2090 }, { "epoch": 0.12120862312775967, "grad_norm": 5.1154449505887385, "learning_rate": 8.789470645962016e-06, "loss": 0.4346, "step": 2100 }, { "epoch": 0.12178580704741566, "grad_norm": 2.203947517060104, "learning_rate": 8.783697973792069e-06, "loss": 0.4225, "step": 2110 }, { "epoch": 0.12236299096707166, "grad_norm": 2.1610133845450683, "learning_rate": 8.777925301622122e-06, "loss": 0.4121, "step": 2120 }, { "epoch": 0.12294017488672765, "grad_norm": 13.953365114471524, "learning_rate": 8.772152629452175e-06, "loss": 0.4304, "step": 2130 }, { "epoch": 0.12351735880638365, "grad_norm": 8.125499458140256, "learning_rate": 8.766379957282226e-06, "loss": 0.4351, "step": 2140 }, { "epoch": 0.12409454272603965, "grad_norm": 10.554186301118584, "learning_rate": 8.760607285112279e-06, "loss": 0.4266, "step": 2150 }, { "epoch": 0.12467172664569565, "grad_norm": 4.270037728186706, "learning_rate": 8.754834612942332e-06, "loss": 0.438, "step": 2160 }, { "epoch": 0.12524891056535165, "grad_norm": 8.246386750659413, "learning_rate": 8.749061940772385e-06, "loss": 0.4283, "step": 2170 }, { "epoch": 0.12582609448500764, "grad_norm": 3.506946209841679, "learning_rate": 8.743289268602438e-06, "loss": 0.4106, "step": 2180 }, { "epoch": 0.12640327840466364, "grad_norm": 6.297671982703174, "learning_rate": 8.73751659643249e-06, "loss": 0.4329, "step": 2190 }, { "epoch": 0.12698046232431964, "grad_norm": 3.18530097096491, "learning_rate": 8.731743924262542e-06, "loss": 0.4185, "step": 2200 }, { "epoch": 0.12755764624397564, "grad_norm": 8.542498824615011, "learning_rate": 8.725971252092594e-06, "loss": 0.4255, "step": 2210 }, { "epoch": 0.12813483016363164, "grad_norm": 4.533535099713348, "learning_rate": 8.720198579922647e-06, "loss": 0.423, "step": 2220 }, { "epoch": 0.12871201408328764, "grad_norm": 3.889530684586912, "learning_rate": 8.7144259077527e-06, "loss": 0.4235, "step": 2230 }, { "epoch": 0.12928919800294364, "grad_norm": 3.6486853451512777, "learning_rate": 8.708653235582753e-06, "loss": 0.4211, "step": 2240 }, { "epoch": 0.12986638192259964, "grad_norm": 4.615918028596828, "learning_rate": 8.702880563412804e-06, "loss": 0.4088, "step": 2250 }, { "epoch": 0.13044356584225564, "grad_norm": 3.850082033721987, "learning_rate": 8.697107891242857e-06, "loss": 0.4282, "step": 2260 }, { "epoch": 0.13102074976191164, "grad_norm": 14.518871261878948, "learning_rate": 8.69133521907291e-06, "loss": 0.431, "step": 2270 }, { "epoch": 0.13159793368156764, "grad_norm": 3.5577521941063797, "learning_rate": 8.685562546902963e-06, "loss": 0.4269, "step": 2280 }, { "epoch": 0.13217511760122364, "grad_norm": 4.319284164265623, "learning_rate": 8.679789874733014e-06, "loss": 0.4151, "step": 2290 }, { "epoch": 0.13275230152087963, "grad_norm": 5.95977091224079, "learning_rate": 8.674017202563068e-06, "loss": 0.434, "step": 2300 }, { "epoch": 0.13332948544053563, "grad_norm": 5.717322004077494, "learning_rate": 8.668244530393119e-06, "loss": 0.412, "step": 2310 }, { "epoch": 0.13390666936019163, "grad_norm": 4.1757793177667315, "learning_rate": 8.662471858223172e-06, "loss": 0.4154, "step": 2320 }, { "epoch": 0.13448385327984763, "grad_norm": 2.4222311786523822, "learning_rate": 8.656699186053225e-06, "loss": 0.4075, "step": 2330 }, { "epoch": 0.13506103719950363, "grad_norm": 4.366879694604865, "learning_rate": 8.650926513883278e-06, "loss": 0.4212, "step": 2340 }, { "epoch": 0.13563822111915963, "grad_norm": 4.573113527280892, "learning_rate": 8.64515384171333e-06, "loss": 0.4235, "step": 2350 }, { "epoch": 0.13621540503881563, "grad_norm": 2.7183246240824612, "learning_rate": 8.639381169543382e-06, "loss": 0.4067, "step": 2360 }, { "epoch": 0.13679258895847163, "grad_norm": 2.858161612405957, "learning_rate": 8.633608497373435e-06, "loss": 0.404, "step": 2370 }, { "epoch": 0.13736977287812763, "grad_norm": 4.083715288704242, "learning_rate": 8.627835825203488e-06, "loss": 0.4063, "step": 2380 }, { "epoch": 0.13794695679778363, "grad_norm": 5.0777931108973595, "learning_rate": 8.62206315303354e-06, "loss": 0.422, "step": 2390 }, { "epoch": 0.13852414071743963, "grad_norm": 7.507110618573909, "learning_rate": 8.616290480863593e-06, "loss": 0.421, "step": 2400 }, { "epoch": 0.1391013246370956, "grad_norm": 5.833761997884338, "learning_rate": 8.610517808693644e-06, "loss": 0.4149, "step": 2410 }, { "epoch": 0.1396785085567516, "grad_norm": 2.9291582156343523, "learning_rate": 8.604745136523697e-06, "loss": 0.4223, "step": 2420 }, { "epoch": 0.1402556924764076, "grad_norm": 3.8527608900197245, "learning_rate": 8.59897246435375e-06, "loss": 0.4381, "step": 2430 }, { "epoch": 0.1408328763960636, "grad_norm": 5.009308450797531, "learning_rate": 8.593199792183803e-06, "loss": 0.4265, "step": 2440 }, { "epoch": 0.1414100603157196, "grad_norm": 4.305682778167611, "learning_rate": 8.587427120013854e-06, "loss": 0.422, "step": 2450 }, { "epoch": 0.1419872442353756, "grad_norm": 4.119426729774013, "learning_rate": 8.581654447843907e-06, "loss": 0.4079, "step": 2460 }, { "epoch": 0.1425644281550316, "grad_norm": 6.546817474930748, "learning_rate": 8.57588177567396e-06, "loss": 0.42, "step": 2470 }, { "epoch": 0.1431416120746876, "grad_norm": 3.668016204374388, "learning_rate": 8.570109103504013e-06, "loss": 0.4323, "step": 2480 }, { "epoch": 0.1437187959943436, "grad_norm": 4.577088984604785, "learning_rate": 8.564336431334065e-06, "loss": 0.412, "step": 2490 }, { "epoch": 0.1442959799139996, "grad_norm": 5.57584596543327, "learning_rate": 8.558563759164118e-06, "loss": 0.4263, "step": 2500 }, { "epoch": 0.1448731638336556, "grad_norm": 2.695769154250156, "learning_rate": 8.55279108699417e-06, "loss": 0.4107, "step": 2510 }, { "epoch": 0.1454503477533116, "grad_norm": 13.95443843109925, "learning_rate": 8.547018414824224e-06, "loss": 0.4213, "step": 2520 }, { "epoch": 0.1460275316729676, "grad_norm": 7.136056655011844, "learning_rate": 8.541245742654275e-06, "loss": 0.4274, "step": 2530 }, { "epoch": 0.14660471559262359, "grad_norm": 6.544753723165297, "learning_rate": 8.535473070484328e-06, "loss": 0.414, "step": 2540 }, { "epoch": 0.14718189951227958, "grad_norm": 5.397598920832979, "learning_rate": 8.52970039831438e-06, "loss": 0.4204, "step": 2550 }, { "epoch": 0.14775908343193558, "grad_norm": 4.756636071432368, "learning_rate": 8.523927726144432e-06, "loss": 0.4124, "step": 2560 }, { "epoch": 0.14833626735159158, "grad_norm": 14.516917457987418, "learning_rate": 8.518155053974486e-06, "loss": 0.4287, "step": 2570 }, { "epoch": 0.14891345127124758, "grad_norm": 8.042684749735274, "learning_rate": 8.512382381804539e-06, "loss": 0.399, "step": 2580 }, { "epoch": 0.14949063519090358, "grad_norm": 7.4003473379136775, "learning_rate": 8.50660970963459e-06, "loss": 0.4113, "step": 2590 }, { "epoch": 0.15006781911055958, "grad_norm": 3.771759855408406, "learning_rate": 8.500837037464643e-06, "loss": 0.4056, "step": 2600 }, { "epoch": 0.15064500303021558, "grad_norm": 3.6286193403799682, "learning_rate": 8.495064365294696e-06, "loss": 0.4246, "step": 2610 }, { "epoch": 0.15122218694987158, "grad_norm": 3.863418428458885, "learning_rate": 8.489291693124749e-06, "loss": 0.4085, "step": 2620 }, { "epoch": 0.15179937086952758, "grad_norm": 4.263367210964064, "learning_rate": 8.4835190209548e-06, "loss": 0.4223, "step": 2630 }, { "epoch": 0.15237655478918358, "grad_norm": 3.337620340539389, "learning_rate": 8.477746348784853e-06, "loss": 0.4175, "step": 2640 }, { "epoch": 0.15295373870883958, "grad_norm": 3.0267657042390788, "learning_rate": 8.471973676614905e-06, "loss": 0.4113, "step": 2650 }, { "epoch": 0.15353092262849558, "grad_norm": 2.6106010567447893, "learning_rate": 8.466201004444958e-06, "loss": 0.4157, "step": 2660 }, { "epoch": 0.15410810654815157, "grad_norm": 2.264152062153991, "learning_rate": 8.46042833227501e-06, "loss": 0.4338, "step": 2670 }, { "epoch": 0.15468529046780757, "grad_norm": 1.9215310401757064, "learning_rate": 8.454655660105064e-06, "loss": 0.4114, "step": 2680 }, { "epoch": 0.15526247438746357, "grad_norm": 2.615154051211967, "learning_rate": 8.448882987935117e-06, "loss": 0.4216, "step": 2690 }, { "epoch": 0.15583965830711957, "grad_norm": 2.491915544143726, "learning_rate": 8.443110315765168e-06, "loss": 0.411, "step": 2700 }, { "epoch": 0.15641684222677557, "grad_norm": 10.894040933901527, "learning_rate": 8.437337643595221e-06, "loss": 0.43, "step": 2710 }, { "epoch": 0.15699402614643157, "grad_norm": 7.186343591522965, "learning_rate": 8.431564971425274e-06, "loss": 0.4119, "step": 2720 }, { "epoch": 0.15757121006608757, "grad_norm": 11.15516113836552, "learning_rate": 8.425792299255327e-06, "loss": 0.4206, "step": 2730 }, { "epoch": 0.15814839398574357, "grad_norm": 3.3849670049000626, "learning_rate": 8.420019627085378e-06, "loss": 0.4098, "step": 2740 }, { "epoch": 0.15872557790539957, "grad_norm": 2.3478329302680176, "learning_rate": 8.414246954915431e-06, "loss": 0.4358, "step": 2750 }, { "epoch": 0.15930276182505557, "grad_norm": 5.368826342998146, "learning_rate": 8.408474282745483e-06, "loss": 0.4114, "step": 2760 }, { "epoch": 0.15987994574471157, "grad_norm": 2.778388386877281, "learning_rate": 8.402701610575536e-06, "loss": 0.4149, "step": 2770 }, { "epoch": 0.16045712966436754, "grad_norm": 6.269723327157733, "learning_rate": 8.396928938405589e-06, "loss": 0.4059, "step": 2780 }, { "epoch": 0.16103431358402354, "grad_norm": 3.51009855789622, "learning_rate": 8.391156266235642e-06, "loss": 0.4073, "step": 2790 }, { "epoch": 0.16161149750367954, "grad_norm": 2.84216423707538, "learning_rate": 8.385383594065693e-06, "loss": 0.4249, "step": 2800 }, { "epoch": 0.16218868142333553, "grad_norm": 4.458775881028344, "learning_rate": 8.379610921895746e-06, "loss": 0.4248, "step": 2810 }, { "epoch": 0.16276586534299153, "grad_norm": 4.7717011271745875, "learning_rate": 8.3738382497258e-06, "loss": 0.4031, "step": 2820 }, { "epoch": 0.16334304926264753, "grad_norm": 2.339306133151702, "learning_rate": 8.368065577555852e-06, "loss": 0.4089, "step": 2830 }, { "epoch": 0.16392023318230353, "grad_norm": 2.242140863987944, "learning_rate": 8.362292905385904e-06, "loss": 0.4183, "step": 2840 }, { "epoch": 0.16449741710195953, "grad_norm": 2.50085114867505, "learning_rate": 8.356520233215957e-06, "loss": 0.4209, "step": 2850 }, { "epoch": 0.16507460102161553, "grad_norm": 2.426762382327375, "learning_rate": 8.350747561046008e-06, "loss": 0.4009, "step": 2860 }, { "epoch": 0.16565178494127153, "grad_norm": 3.535990085619889, "learning_rate": 8.344974888876063e-06, "loss": 0.4145, "step": 2870 }, { "epoch": 0.16622896886092753, "grad_norm": 1.9434196504603485, "learning_rate": 8.339202216706114e-06, "loss": 0.4029, "step": 2880 }, { "epoch": 0.16680615278058353, "grad_norm": 3.647575465250371, "learning_rate": 8.333429544536167e-06, "loss": 0.4165, "step": 2890 }, { "epoch": 0.16738333670023953, "grad_norm": 3.21796325121302, "learning_rate": 8.327656872366218e-06, "loss": 0.419, "step": 2900 }, { "epoch": 0.16796052061989553, "grad_norm": 2.49775247695972, "learning_rate": 8.321884200196271e-06, "loss": 0.4179, "step": 2910 }, { "epoch": 0.16853770453955152, "grad_norm": 1.992198339323461, "learning_rate": 8.316111528026324e-06, "loss": 0.4062, "step": 2920 }, { "epoch": 0.16911488845920752, "grad_norm": 2.82923364634973, "learning_rate": 8.310338855856377e-06, "loss": 0.3997, "step": 2930 }, { "epoch": 0.16969207237886352, "grad_norm": 2.414622342801528, "learning_rate": 8.304566183686429e-06, "loss": 0.3978, "step": 2940 }, { "epoch": 0.17026925629851952, "grad_norm": 3.500929543134566, "learning_rate": 8.298793511516482e-06, "loss": 0.4214, "step": 2950 }, { "epoch": 0.17084644021817552, "grad_norm": 4.533770301559384, "learning_rate": 8.293020839346535e-06, "loss": 0.4003, "step": 2960 }, { "epoch": 0.17142362413783152, "grad_norm": 3.463538477874354, "learning_rate": 8.287248167176588e-06, "loss": 0.4049, "step": 2970 }, { "epoch": 0.17200080805748752, "grad_norm": 2.6643523986040023, "learning_rate": 8.281475495006639e-06, "loss": 0.4157, "step": 2980 }, { "epoch": 0.17257799197714352, "grad_norm": 5.1202436242415175, "learning_rate": 8.275702822836692e-06, "loss": 0.4119, "step": 2990 }, { "epoch": 0.17315517589679952, "grad_norm": 4.375642688680107, "learning_rate": 8.269930150666743e-06, "loss": 0.4241, "step": 3000 }, { "epoch": 0.17373235981645552, "grad_norm": 13.184415266130015, "learning_rate": 8.264157478496796e-06, "loss": 0.3931, "step": 3010 }, { "epoch": 0.17430954373611152, "grad_norm": 3.5119329436961872, "learning_rate": 8.25838480632685e-06, "loss": 0.4332, "step": 3020 }, { "epoch": 0.17488672765576752, "grad_norm": 2.057788212146213, "learning_rate": 8.252612134156902e-06, "loss": 0.4026, "step": 3030 }, { "epoch": 0.17546391157542351, "grad_norm": 2.4025914280540293, "learning_rate": 8.246839461986954e-06, "loss": 0.4297, "step": 3040 }, { "epoch": 0.1760410954950795, "grad_norm": 2.3273476646183187, "learning_rate": 8.241066789817007e-06, "loss": 0.4256, "step": 3050 }, { "epoch": 0.1766182794147355, "grad_norm": 3.968034409940849, "learning_rate": 8.23529411764706e-06, "loss": 0.3911, "step": 3060 }, { "epoch": 0.1771954633343915, "grad_norm": 3.5987187085577648, "learning_rate": 8.229521445477113e-06, "loss": 0.4137, "step": 3070 }, { "epoch": 0.1777726472540475, "grad_norm": 1.6685524653169845, "learning_rate": 8.223748773307164e-06, "loss": 0.4115, "step": 3080 }, { "epoch": 0.1783498311737035, "grad_norm": 15.206856747451255, "learning_rate": 8.217976101137217e-06, "loss": 0.3993, "step": 3090 }, { "epoch": 0.1789270150933595, "grad_norm": 3.2248196017828685, "learning_rate": 8.212203428967268e-06, "loss": 0.4263, "step": 3100 }, { "epoch": 0.1795041990130155, "grad_norm": 5.06230398614513, "learning_rate": 8.206430756797322e-06, "loss": 0.4241, "step": 3110 }, { "epoch": 0.1800813829326715, "grad_norm": 24.83780832952265, "learning_rate": 8.200658084627375e-06, "loss": 0.4139, "step": 3120 }, { "epoch": 0.1806585668523275, "grad_norm": 2.8321722632943094, "learning_rate": 8.194885412457428e-06, "loss": 0.4233, "step": 3130 }, { "epoch": 0.1812357507719835, "grad_norm": 3.0260534676681727, "learning_rate": 8.189112740287479e-06, "loss": 0.4142, "step": 3140 }, { "epoch": 0.18181293469163948, "grad_norm": 4.68099176672326, "learning_rate": 8.183340068117532e-06, "loss": 0.4118, "step": 3150 }, { "epoch": 0.18239011861129548, "grad_norm": 11.310308197618612, "learning_rate": 8.177567395947585e-06, "loss": 0.415, "step": 3160 }, { "epoch": 0.18296730253095148, "grad_norm": 4.572818458115979, "learning_rate": 8.171794723777638e-06, "loss": 0.4015, "step": 3170 }, { "epoch": 0.18354448645060747, "grad_norm": 6.450421868687323, "learning_rate": 8.16602205160769e-06, "loss": 0.4263, "step": 3180 }, { "epoch": 0.18412167037026347, "grad_norm": 3.4296934325228263, "learning_rate": 8.160249379437742e-06, "loss": 0.4124, "step": 3190 }, { "epoch": 0.18469885428991947, "grad_norm": 9.959813392971029, "learning_rate": 8.154476707267794e-06, "loss": 0.4015, "step": 3200 }, { "epoch": 0.18527603820957547, "grad_norm": 3.3029855110594695, "learning_rate": 8.148704035097847e-06, "loss": 0.4009, "step": 3210 }, { "epoch": 0.18585322212923147, "grad_norm": 2.8097447076161273, "learning_rate": 8.142931362927901e-06, "loss": 0.4002, "step": 3220 }, { "epoch": 0.18643040604888747, "grad_norm": 34.98571611934199, "learning_rate": 8.137158690757953e-06, "loss": 0.405, "step": 3230 }, { "epoch": 0.18700758996854347, "grad_norm": 9.148426684066074, "learning_rate": 8.131386018588006e-06, "loss": 0.4044, "step": 3240 }, { "epoch": 0.18758477388819947, "grad_norm": 4.173548086962396, "learning_rate": 8.125613346418057e-06, "loss": 0.4217, "step": 3250 }, { "epoch": 0.18816195780785547, "grad_norm": 53.37411486094812, "learning_rate": 8.11984067424811e-06, "loss": 0.4138, "step": 3260 }, { "epoch": 0.18873914172751147, "grad_norm": 2.3480379802050733, "learning_rate": 8.114068002078163e-06, "loss": 0.4033, "step": 3270 }, { "epoch": 0.18931632564716747, "grad_norm": 2.221291345204337, "learning_rate": 8.108295329908216e-06, "loss": 0.3914, "step": 3280 }, { "epoch": 0.18989350956682347, "grad_norm": 4.990540206022043, "learning_rate": 8.102522657738267e-06, "loss": 0.3985, "step": 3290 }, { "epoch": 0.19047069348647946, "grad_norm": 4.025051683089965, "learning_rate": 8.09674998556832e-06, "loss": 0.415, "step": 3300 }, { "epoch": 0.19104787740613546, "grad_norm": 2.5251837331957607, "learning_rate": 8.090977313398373e-06, "loss": 0.4133, "step": 3310 }, { "epoch": 0.19162506132579146, "grad_norm": 2.380391015882718, "learning_rate": 8.085204641228426e-06, "loss": 0.3926, "step": 3320 }, { "epoch": 0.19220224524544746, "grad_norm": 4.41017165770354, "learning_rate": 8.079431969058478e-06, "loss": 0.4211, "step": 3330 }, { "epoch": 0.19277942916510346, "grad_norm": 2.3063890276035846, "learning_rate": 8.07365929688853e-06, "loss": 0.4138, "step": 3340 }, { "epoch": 0.19335661308475946, "grad_norm": 4.271620366325266, "learning_rate": 8.067886624718582e-06, "loss": 0.4093, "step": 3350 }, { "epoch": 0.19393379700441546, "grad_norm": 32.70807377417423, "learning_rate": 8.062113952548635e-06, "loss": 0.4035, "step": 3360 }, { "epoch": 0.19451098092407146, "grad_norm": 2.4018179935448276, "learning_rate": 8.056341280378688e-06, "loss": 0.4126, "step": 3370 }, { "epoch": 0.19508816484372746, "grad_norm": 3.1209658711505552, "learning_rate": 8.050568608208741e-06, "loss": 0.4073, "step": 3380 }, { "epoch": 0.19566534876338346, "grad_norm": 3.6351629831980024, "learning_rate": 8.044795936038793e-06, "loss": 0.4041, "step": 3390 }, { "epoch": 0.19624253268303946, "grad_norm": 3.652368336229329, "learning_rate": 8.039023263868846e-06, "loss": 0.4112, "step": 3400 }, { "epoch": 0.19681971660269545, "grad_norm": 2.984404633116569, "learning_rate": 8.033250591698899e-06, "loss": 0.4134, "step": 3410 }, { "epoch": 0.19739690052235145, "grad_norm": 2.4526774932392024, "learning_rate": 8.027477919528952e-06, "loss": 0.4011, "step": 3420 }, { "epoch": 0.19797408444200745, "grad_norm": 4.571072305725846, "learning_rate": 8.021705247359003e-06, "loss": 0.4097, "step": 3430 }, { "epoch": 0.19855126836166345, "grad_norm": 2.6003306321869895, "learning_rate": 8.015932575189056e-06, "loss": 0.406, "step": 3440 }, { "epoch": 0.19912845228131945, "grad_norm": 4.171122512202554, "learning_rate": 8.010159903019107e-06, "loss": 0.4083, "step": 3450 }, { "epoch": 0.19970563620097545, "grad_norm": 3.2250910237655006, "learning_rate": 8.00438723084916e-06, "loss": 0.415, "step": 3460 }, { "epoch": 0.20028282012063145, "grad_norm": 4.871724300699273, "learning_rate": 7.998614558679213e-06, "loss": 0.4296, "step": 3470 }, { "epoch": 0.20086000404028745, "grad_norm": 2.9738579217002887, "learning_rate": 7.992841886509266e-06, "loss": 0.4109, "step": 3480 }, { "epoch": 0.20143718795994345, "grad_norm": 5.488557624180365, "learning_rate": 7.987069214339318e-06, "loss": 0.4203, "step": 3490 }, { "epoch": 0.20201437187959945, "grad_norm": 6.30148557707432, "learning_rate": 7.98129654216937e-06, "loss": 0.4166, "step": 3500 }, { "epoch": 0.20259155579925545, "grad_norm": 4.159946548536443, "learning_rate": 7.975523869999424e-06, "loss": 0.4126, "step": 3510 }, { "epoch": 0.20316873971891142, "grad_norm": 15.334889223997754, "learning_rate": 7.969751197829477e-06, "loss": 0.3947, "step": 3520 }, { "epoch": 0.20374592363856742, "grad_norm": 3.025285897677573, "learning_rate": 7.963978525659528e-06, "loss": 0.403, "step": 3530 }, { "epoch": 0.20432310755822342, "grad_norm": 4.262956488693212, "learning_rate": 7.958205853489581e-06, "loss": 0.4066, "step": 3540 }, { "epoch": 0.20490029147787941, "grad_norm": 2.5780055567960143, "learning_rate": 7.952433181319632e-06, "loss": 0.4032, "step": 3550 }, { "epoch": 0.20547747539753541, "grad_norm": 4.956342345823864, "learning_rate": 7.946660509149685e-06, "loss": 0.4022, "step": 3560 }, { "epoch": 0.2060546593171914, "grad_norm": 3.89966793053596, "learning_rate": 7.940887836979738e-06, "loss": 0.4059, "step": 3570 }, { "epoch": 0.2066318432368474, "grad_norm": 2.598383914711163, "learning_rate": 7.935115164809791e-06, "loss": 0.3988, "step": 3580 }, { "epoch": 0.2072090271565034, "grad_norm": 3.673402619499888, "learning_rate": 7.929342492639843e-06, "loss": 0.4117, "step": 3590 }, { "epoch": 0.2077862110761594, "grad_norm": 5.7308049694271235, "learning_rate": 7.923569820469896e-06, "loss": 0.4008, "step": 3600 }, { "epoch": 0.2083633949958154, "grad_norm": 7.935773567013425, "learning_rate": 7.917797148299949e-06, "loss": 0.3877, "step": 3610 }, { "epoch": 0.2089405789154714, "grad_norm": 6.009048915243287, "learning_rate": 7.912024476130002e-06, "loss": 0.4065, "step": 3620 }, { "epoch": 0.2095177628351274, "grad_norm": 2.95995624359623, "learning_rate": 7.906251803960053e-06, "loss": 0.401, "step": 3630 }, { "epoch": 0.2100949467547834, "grad_norm": 3.1037746592103255, "learning_rate": 7.900479131790106e-06, "loss": 0.4013, "step": 3640 }, { "epoch": 0.2106721306744394, "grad_norm": 6.029705929267264, "learning_rate": 7.894706459620158e-06, "loss": 0.4085, "step": 3650 }, { "epoch": 0.2112493145940954, "grad_norm": 5.81081103172759, "learning_rate": 7.888933787450212e-06, "loss": 0.4062, "step": 3660 }, { "epoch": 0.2118264985137514, "grad_norm": 6.6915581684773935, "learning_rate": 7.883161115280264e-06, "loss": 0.4146, "step": 3670 }, { "epoch": 0.2124036824334074, "grad_norm": 4.483451936143915, "learning_rate": 7.877388443110317e-06, "loss": 0.3964, "step": 3680 }, { "epoch": 0.2129808663530634, "grad_norm": 4.847478023953505, "learning_rate": 7.871615770940368e-06, "loss": 0.3968, "step": 3690 }, { "epoch": 0.2135580502727194, "grad_norm": 4.734123259756348, "learning_rate": 7.865843098770421e-06, "loss": 0.411, "step": 3700 }, { "epoch": 0.2141352341923754, "grad_norm": 67.93482554004888, "learning_rate": 7.860070426600474e-06, "loss": 0.4171, "step": 3710 }, { "epoch": 0.2147124181120314, "grad_norm": 4.427723281232474, "learning_rate": 7.854297754430527e-06, "loss": 0.3901, "step": 3720 }, { "epoch": 0.2152896020316874, "grad_norm": 5.970970375519135, "learning_rate": 7.848525082260578e-06, "loss": 0.392, "step": 3730 }, { "epoch": 0.2158667859513434, "grad_norm": 3.364616367304386, "learning_rate": 7.842752410090631e-06, "loss": 0.4262, "step": 3740 }, { "epoch": 0.2164439698709994, "grad_norm": 14.709875574791262, "learning_rate": 7.836979737920684e-06, "loss": 0.4189, "step": 3750 }, { "epoch": 0.2170211537906554, "grad_norm": 34.85728037006575, "learning_rate": 7.831207065750737e-06, "loss": 0.4281, "step": 3760 }, { "epoch": 0.2175983377103114, "grad_norm": 3.8349995808560835, "learning_rate": 7.82543439358079e-06, "loss": 0.4197, "step": 3770 }, { "epoch": 0.2181755216299674, "grad_norm": 9.826012532840847, "learning_rate": 7.819661721410842e-06, "loss": 0.411, "step": 3780 }, { "epoch": 0.2187527055496234, "grad_norm": 5.148779185873613, "learning_rate": 7.813889049240895e-06, "loss": 0.408, "step": 3790 }, { "epoch": 0.2193298894692794, "grad_norm": 6.034690169401536, "learning_rate": 7.808116377070946e-06, "loss": 0.389, "step": 3800 }, { "epoch": 0.2199070733889354, "grad_norm": 7.98093273312519, "learning_rate": 7.802343704900999e-06, "loss": 0.4087, "step": 3810 }, { "epoch": 0.2204842573085914, "grad_norm": 20.795705430196968, "learning_rate": 7.796571032731052e-06, "loss": 0.4164, "step": 3820 }, { "epoch": 0.2210614412282474, "grad_norm": 4.546394166434191, "learning_rate": 7.790798360561105e-06, "loss": 0.4198, "step": 3830 }, { "epoch": 0.2216386251479034, "grad_norm": 2.8073314189395457, "learning_rate": 7.785025688391156e-06, "loss": 0.4201, "step": 3840 }, { "epoch": 0.2222158090675594, "grad_norm": 24.430024193972454, "learning_rate": 7.77925301622121e-06, "loss": 0.3916, "step": 3850 }, { "epoch": 0.2227929929872154, "grad_norm": 3.7314691801815525, "learning_rate": 7.773480344051262e-06, "loss": 0.4034, "step": 3860 }, { "epoch": 0.2233701769068714, "grad_norm": 4.463974565740604, "learning_rate": 7.767707671881316e-06, "loss": 0.4177, "step": 3870 }, { "epoch": 0.2239473608265274, "grad_norm": 3.935819483400635, "learning_rate": 7.761934999711367e-06, "loss": 0.4186, "step": 3880 }, { "epoch": 0.22452454474618336, "grad_norm": 2.7465902804726343, "learning_rate": 7.75616232754142e-06, "loss": 0.4355, "step": 3890 }, { "epoch": 0.22510172866583936, "grad_norm": 2.59329896630652, "learning_rate": 7.750389655371471e-06, "loss": 0.4139, "step": 3900 }, { "epoch": 0.22567891258549536, "grad_norm": 2.8540002686300214, "learning_rate": 7.744616983201524e-06, "loss": 0.4062, "step": 3910 }, { "epoch": 0.22625609650515136, "grad_norm": 6.48349060337823, "learning_rate": 7.738844311031577e-06, "loss": 0.4002, "step": 3920 }, { "epoch": 0.22683328042480735, "grad_norm": 4.668616924928805, "learning_rate": 7.73307163886163e-06, "loss": 0.4041, "step": 3930 }, { "epoch": 0.22741046434446335, "grad_norm": 6.539752215401261, "learning_rate": 7.727298966691682e-06, "loss": 0.3994, "step": 3940 }, { "epoch": 0.22798764826411935, "grad_norm": 3.3595541678536156, "learning_rate": 7.721526294521735e-06, "loss": 0.3995, "step": 3950 }, { "epoch": 0.22856483218377535, "grad_norm": 2.8441727594832886, "learning_rate": 7.715753622351788e-06, "loss": 0.4, "step": 3960 }, { "epoch": 0.22914201610343135, "grad_norm": 2.849353128300574, "learning_rate": 7.70998095018184e-06, "loss": 0.4048, "step": 3970 }, { "epoch": 0.22971920002308735, "grad_norm": 4.881214366450045, "learning_rate": 7.704208278011892e-06, "loss": 0.4054, "step": 3980 }, { "epoch": 0.23029638394274335, "grad_norm": 15.46486836243805, "learning_rate": 7.698435605841945e-06, "loss": 0.3923, "step": 3990 }, { "epoch": 0.23087356786239935, "grad_norm": 3.847585477525563, "learning_rate": 7.692662933671996e-06, "loss": 0.393, "step": 4000 }, { "epoch": 0.23145075178205535, "grad_norm": 38.3399434241166, "learning_rate": 7.686890261502051e-06, "loss": 0.3926, "step": 4010 }, { "epoch": 0.23202793570171135, "grad_norm": 2.6580858390650466, "learning_rate": 7.681117589332102e-06, "loss": 0.3948, "step": 4020 }, { "epoch": 0.23260511962136735, "grad_norm": 4.466615319467433, "learning_rate": 7.675344917162155e-06, "loss": 0.402, "step": 4030 }, { "epoch": 0.23318230354102334, "grad_norm": 6.7414838396917505, "learning_rate": 7.669572244992207e-06, "loss": 0.3904, "step": 4040 }, { "epoch": 0.23375948746067934, "grad_norm": 4.03997703346681, "learning_rate": 7.66379957282226e-06, "loss": 0.3938, "step": 4050 }, { "epoch": 0.23433667138033534, "grad_norm": 4.258742526842458, "learning_rate": 7.658026900652313e-06, "loss": 0.3998, "step": 4060 }, { "epoch": 0.23491385529999134, "grad_norm": 5.100923170543839, "learning_rate": 7.652254228482366e-06, "loss": 0.3968, "step": 4070 }, { "epoch": 0.23549103921964734, "grad_norm": 8.332021271422962, "learning_rate": 7.646481556312417e-06, "loss": 0.4017, "step": 4080 }, { "epoch": 0.23606822313930334, "grad_norm": 5.001487618559788, "learning_rate": 7.64070888414247e-06, "loss": 0.3929, "step": 4090 }, { "epoch": 0.23664540705895934, "grad_norm": 6.605470741420995, "learning_rate": 7.634936211972523e-06, "loss": 0.3995, "step": 4100 }, { "epoch": 0.23722259097861534, "grad_norm": 4.352594377363156, "learning_rate": 7.629163539802575e-06, "loss": 0.4008, "step": 4110 }, { "epoch": 0.23779977489827134, "grad_norm": 8.143604964743357, "learning_rate": 7.6233908676326275e-06, "loss": 0.3987, "step": 4120 }, { "epoch": 0.23837695881792734, "grad_norm": 3.9869800783007427, "learning_rate": 7.6176181954626805e-06, "loss": 0.3874, "step": 4130 }, { "epoch": 0.23895414273758334, "grad_norm": 3.4272782347037207, "learning_rate": 7.611845523292733e-06, "loss": 0.3869, "step": 4140 }, { "epoch": 0.23953132665723934, "grad_norm": 2.964033904850173, "learning_rate": 7.606072851122786e-06, "loss": 0.3962, "step": 4150 }, { "epoch": 0.24010851057689533, "grad_norm": 55.68370603421082, "learning_rate": 7.600300178952838e-06, "loss": 0.3981, "step": 4160 }, { "epoch": 0.24068569449655133, "grad_norm": 4.402682573412378, "learning_rate": 7.594527506782891e-06, "loss": 0.4152, "step": 4170 }, { "epoch": 0.24126287841620733, "grad_norm": 2.8760794787596997, "learning_rate": 7.588754834612942e-06, "loss": 0.3881, "step": 4180 }, { "epoch": 0.24184006233586333, "grad_norm": 2.0239240283122575, "learning_rate": 7.582982162442995e-06, "loss": 0.4023, "step": 4190 }, { "epoch": 0.24241724625551933, "grad_norm": 2.5930742840295986, "learning_rate": 7.577209490273047e-06, "loss": 0.4146, "step": 4200 }, { "epoch": 0.24299443017517533, "grad_norm": 12.25806910583576, "learning_rate": 7.5714368181031e-06, "loss": 0.4268, "step": 4210 }, { "epoch": 0.24357161409483133, "grad_norm": 9.121606156996025, "learning_rate": 7.565664145933153e-06, "loss": 0.3926, "step": 4220 }, { "epoch": 0.24414879801448733, "grad_norm": 9.12741140098973, "learning_rate": 7.559891473763206e-06, "loss": 0.3908, "step": 4230 }, { "epoch": 0.24472598193414333, "grad_norm": 2.988966863298224, "learning_rate": 7.554118801593258e-06, "loss": 0.4077, "step": 4240 }, { "epoch": 0.24530316585379933, "grad_norm": 4.755614024652895, "learning_rate": 7.548346129423311e-06, "loss": 0.3923, "step": 4250 }, { "epoch": 0.2458803497734553, "grad_norm": 3.286123151497483, "learning_rate": 7.542573457253363e-06, "loss": 0.3842, "step": 4260 }, { "epoch": 0.2464575336931113, "grad_norm": 24.75179268960897, "learning_rate": 7.536800785083416e-06, "loss": 0.4348, "step": 4270 }, { "epoch": 0.2470347176127673, "grad_norm": 2.1639276020923064, "learning_rate": 7.531028112913469e-06, "loss": 0.4064, "step": 4280 }, { "epoch": 0.2476119015324233, "grad_norm": 1.9508786971501029, "learning_rate": 7.525255440743521e-06, "loss": 0.4085, "step": 4290 }, { "epoch": 0.2481890854520793, "grad_norm": 1.8580731020746826, "learning_rate": 7.519482768573574e-06, "loss": 0.4111, "step": 4300 }, { "epoch": 0.2487662693717353, "grad_norm": 2.6928452549457407, "learning_rate": 7.5137100964036255e-06, "loss": 0.4115, "step": 4310 }, { "epoch": 0.2493434532913913, "grad_norm": 2.054083128359146, "learning_rate": 7.5079374242336786e-06, "loss": 0.4121, "step": 4320 }, { "epoch": 0.2499206372110473, "grad_norm": 2.6316000209561663, "learning_rate": 7.502164752063731e-06, "loss": 0.4186, "step": 4330 }, { "epoch": 0.2504978211307033, "grad_norm": 1.8771833797620388, "learning_rate": 7.496392079893784e-06, "loss": 0.4001, "step": 4340 }, { "epoch": 0.2510750050503593, "grad_norm": 9.221358255158016, "learning_rate": 7.490619407723836e-06, "loss": 0.4177, "step": 4350 }, { "epoch": 0.2516521889700153, "grad_norm": 1.8539444504213582, "learning_rate": 7.484846735553889e-06, "loss": 0.4055, "step": 4360 }, { "epoch": 0.2522293728896713, "grad_norm": 2.284022523834839, "learning_rate": 7.479074063383941e-06, "loss": 0.4228, "step": 4370 }, { "epoch": 0.2528065568093273, "grad_norm": 1.8613795057409426, "learning_rate": 7.473301391213994e-06, "loss": 0.3886, "step": 4380 }, { "epoch": 0.2533837407289833, "grad_norm": 3.7779942998503855, "learning_rate": 7.467528719044046e-06, "loss": 0.4158, "step": 4390 }, { "epoch": 0.2539609246486393, "grad_norm": 3.7840065627156365, "learning_rate": 7.461756046874099e-06, "loss": 0.4187, "step": 4400 }, { "epoch": 0.2545381085682953, "grad_norm": 5.099586666363089, "learning_rate": 7.455983374704151e-06, "loss": 0.4098, "step": 4410 }, { "epoch": 0.2551152924879513, "grad_norm": 2.6870891278248337, "learning_rate": 7.450210702534204e-06, "loss": 0.3986, "step": 4420 }, { "epoch": 0.2556924764076073, "grad_norm": 2.7248868412027583, "learning_rate": 7.444438030364256e-06, "loss": 0.4038, "step": 4430 }, { "epoch": 0.2562696603272633, "grad_norm": 3.714403433710303, "learning_rate": 7.438665358194309e-06, "loss": 0.4045, "step": 4440 }, { "epoch": 0.2568468442469193, "grad_norm": 2.9240340589059644, "learning_rate": 7.432892686024361e-06, "loss": 0.3911, "step": 4450 }, { "epoch": 0.2574240281665753, "grad_norm": 4.331854988527969, "learning_rate": 7.427120013854414e-06, "loss": 0.394, "step": 4460 }, { "epoch": 0.2580012120862313, "grad_norm": 6.84340943547103, "learning_rate": 7.421347341684466e-06, "loss": 0.3884, "step": 4470 }, { "epoch": 0.2585783960058873, "grad_norm": 26.71610826157837, "learning_rate": 7.415574669514519e-06, "loss": 0.4126, "step": 4480 }, { "epoch": 0.2591555799255433, "grad_norm": 3.691538028091923, "learning_rate": 7.4098019973445714e-06, "loss": 0.3997, "step": 4490 }, { "epoch": 0.2597327638451993, "grad_norm": 3.6466304992174527, "learning_rate": 7.4040293251746245e-06, "loss": 0.387, "step": 4500 }, { "epoch": 0.2603099477648553, "grad_norm": 3.069337367045407, "learning_rate": 7.398256653004677e-06, "loss": 0.3925, "step": 4510 }, { "epoch": 0.2608871316845113, "grad_norm": 17.7941278304272, "learning_rate": 7.39248398083473e-06, "loss": 0.3904, "step": 4520 }, { "epoch": 0.2614643156041673, "grad_norm": 4.010302222583594, "learning_rate": 7.386711308664781e-06, "loss": 0.4, "step": 4530 }, { "epoch": 0.2620414995238233, "grad_norm": 2.891669458141575, "learning_rate": 7.380938636494834e-06, "loss": 0.3916, "step": 4540 }, { "epoch": 0.2626186834434793, "grad_norm": 3.5781925847929736, "learning_rate": 7.375165964324886e-06, "loss": 0.3901, "step": 4550 }, { "epoch": 0.26319586736313527, "grad_norm": 2.8217409668695814, "learning_rate": 7.369393292154939e-06, "loss": 0.41, "step": 4560 }, { "epoch": 0.26377305128279127, "grad_norm": 4.326569507185014, "learning_rate": 7.363620619984991e-06, "loss": 0.4091, "step": 4570 }, { "epoch": 0.26435023520244727, "grad_norm": 5.515247686980751, "learning_rate": 7.357847947815044e-06, "loss": 0.4203, "step": 4580 }, { "epoch": 0.26492741912210327, "grad_norm": 6.561612435080219, "learning_rate": 7.3520752756450966e-06, "loss": 0.3951, "step": 4590 }, { "epoch": 0.26550460304175927, "grad_norm": 2.241772546310698, "learning_rate": 7.34630260347515e-06, "loss": 0.3985, "step": 4600 }, { "epoch": 0.26608178696141527, "grad_norm": 2.524827575292416, "learning_rate": 7.340529931305202e-06, "loss": 0.3981, "step": 4610 }, { "epoch": 0.26665897088107127, "grad_norm": 2.4686565154848106, "learning_rate": 7.334757259135255e-06, "loss": 0.3948, "step": 4620 }, { "epoch": 0.26723615480072727, "grad_norm": 5.055286394352697, "learning_rate": 7.328984586965306e-06, "loss": 0.3966, "step": 4630 }, { "epoch": 0.26781333872038326, "grad_norm": 2.4105713306719023, "learning_rate": 7.323211914795359e-06, "loss": 0.3896, "step": 4640 }, { "epoch": 0.26839052264003926, "grad_norm": 8.884358381031186, "learning_rate": 7.317439242625411e-06, "loss": 0.4043, "step": 4650 }, { "epoch": 0.26896770655969526, "grad_norm": 3.3786577911171465, "learning_rate": 7.311666570455464e-06, "loss": 0.4087, "step": 4660 }, { "epoch": 0.26954489047935126, "grad_norm": 2.241166004757874, "learning_rate": 7.3058938982855165e-06, "loss": 0.3961, "step": 4670 }, { "epoch": 0.27012207439900726, "grad_norm": 3.7398767727731195, "learning_rate": 7.3001212261155695e-06, "loss": 0.4025, "step": 4680 }, { "epoch": 0.27069925831866326, "grad_norm": 2.4627601331961024, "learning_rate": 7.294348553945622e-06, "loss": 0.3967, "step": 4690 }, { "epoch": 0.27127644223831926, "grad_norm": 2.9400014965222243, "learning_rate": 7.288575881775675e-06, "loss": 0.398, "step": 4700 }, { "epoch": 0.27185362615797526, "grad_norm": 2.371642161881622, "learning_rate": 7.282803209605727e-06, "loss": 0.3878, "step": 4710 }, { "epoch": 0.27243081007763126, "grad_norm": 2.1217448647861943, "learning_rate": 7.27703053743578e-06, "loss": 0.3961, "step": 4720 }, { "epoch": 0.27300799399728726, "grad_norm": 6.480519075871927, "learning_rate": 7.271257865265832e-06, "loss": 0.3904, "step": 4730 }, { "epoch": 0.27358517791694326, "grad_norm": 3.7782562668503292, "learning_rate": 7.265485193095885e-06, "loss": 0.4015, "step": 4740 }, { "epoch": 0.27416236183659926, "grad_norm": 27.34719190524476, "learning_rate": 7.259712520925936e-06, "loss": 0.3995, "step": 4750 }, { "epoch": 0.27473954575625525, "grad_norm": 2.5984426690848044, "learning_rate": 7.2539398487559894e-06, "loss": 0.4091, "step": 4760 }, { "epoch": 0.27531672967591125, "grad_norm": 2.269931164816007, "learning_rate": 7.248167176586042e-06, "loss": 0.391, "step": 4770 }, { "epoch": 0.27589391359556725, "grad_norm": 2.0949435472109443, "learning_rate": 7.242394504416095e-06, "loss": 0.3867, "step": 4780 }, { "epoch": 0.27647109751522325, "grad_norm": 2.1688865736563794, "learning_rate": 7.236621832246147e-06, "loss": 0.3819, "step": 4790 }, { "epoch": 0.27704828143487925, "grad_norm": 3.6275115123885744, "learning_rate": 7.2308491600762e-06, "loss": 0.403, "step": 4800 }, { "epoch": 0.27762546535453525, "grad_norm": 2.7044630613298204, "learning_rate": 7.225076487906253e-06, "loss": 0.401, "step": 4810 }, { "epoch": 0.2782026492741912, "grad_norm": 3.6256795573786853, "learning_rate": 7.219303815736305e-06, "loss": 0.4008, "step": 4820 }, { "epoch": 0.2787798331938472, "grad_norm": 2.3560879474365595, "learning_rate": 7.213531143566358e-06, "loss": 0.382, "step": 4830 }, { "epoch": 0.2793570171135032, "grad_norm": 6.363609259389832, "learning_rate": 7.20775847139641e-06, "loss": 0.3889, "step": 4840 }, { "epoch": 0.2799342010331592, "grad_norm": 2.447343796783594, "learning_rate": 7.201985799226463e-06, "loss": 0.3819, "step": 4850 }, { "epoch": 0.2805113849528152, "grad_norm": 4.454942195776334, "learning_rate": 7.1962131270565146e-06, "loss": 0.4029, "step": 4860 }, { "epoch": 0.2810885688724712, "grad_norm": 4.7341175135353675, "learning_rate": 7.1904404548865684e-06, "loss": 0.3962, "step": 4870 }, { "epoch": 0.2816657527921272, "grad_norm": 1.8389081163765115, "learning_rate": 7.18466778271662e-06, "loss": 0.3962, "step": 4880 }, { "epoch": 0.2822429367117832, "grad_norm": 3.2296499793817612, "learning_rate": 7.178895110546673e-06, "loss": 0.3942, "step": 4890 }, { "epoch": 0.2828201206314392, "grad_norm": 3.947947823306894, "learning_rate": 7.173122438376725e-06, "loss": 0.3976, "step": 4900 }, { "epoch": 0.2833973045510952, "grad_norm": 3.562193655967395, "learning_rate": 7.167349766206778e-06, "loss": 0.3876, "step": 4910 }, { "epoch": 0.2839744884707512, "grad_norm": 2.0441973593521086, "learning_rate": 7.16157709403683e-06, "loss": 0.404, "step": 4920 }, { "epoch": 0.2845516723904072, "grad_norm": 2.3917393046670075, "learning_rate": 7.155804421866883e-06, "loss": 0.4169, "step": 4930 }, { "epoch": 0.2851288563100632, "grad_norm": 2.328855705516118, "learning_rate": 7.150031749696935e-06, "loss": 0.3886, "step": 4940 }, { "epoch": 0.2857060402297192, "grad_norm": 5.150276479313902, "learning_rate": 7.144259077526988e-06, "loss": 0.401, "step": 4950 }, { "epoch": 0.2862832241493752, "grad_norm": 2.4042125393360907, "learning_rate": 7.1384864053570405e-06, "loss": 0.4003, "step": 4960 }, { "epoch": 0.2868604080690312, "grad_norm": 2.8838052574781257, "learning_rate": 7.1327137331870936e-06, "loss": 0.3992, "step": 4970 }, { "epoch": 0.2874375919886872, "grad_norm": 3.3196900990562646, "learning_rate": 7.126941061017145e-06, "loss": 0.4025, "step": 4980 }, { "epoch": 0.2880147759083432, "grad_norm": 5.299768426314854, "learning_rate": 7.121168388847198e-06, "loss": 0.3994, "step": 4990 }, { "epoch": 0.2885919598279992, "grad_norm": 27.899807155688983, "learning_rate": 7.11539571667725e-06, "loss": 0.402, "step": 5000 }, { "epoch": 0.2891691437476552, "grad_norm": 2.2901385928116484, "learning_rate": 7.109623044507303e-06, "loss": 0.3936, "step": 5010 }, { "epoch": 0.2897463276673112, "grad_norm": 5.293699045227353, "learning_rate": 7.103850372337355e-06, "loss": 0.3977, "step": 5020 }, { "epoch": 0.2903235115869672, "grad_norm": 2.2638394900698624, "learning_rate": 7.098077700167408e-06, "loss": 0.3932, "step": 5030 }, { "epoch": 0.2909006955066232, "grad_norm": 2.2021953017040317, "learning_rate": 7.0923050279974605e-06, "loss": 0.4063, "step": 5040 }, { "epoch": 0.2914778794262792, "grad_norm": 3.0086020205487363, "learning_rate": 7.0865323558275135e-06, "loss": 0.3948, "step": 5050 }, { "epoch": 0.2920550633459352, "grad_norm": 2.1426519571708615, "learning_rate": 7.080759683657566e-06, "loss": 0.3967, "step": 5060 }, { "epoch": 0.29263224726559117, "grad_norm": 2.396350457849943, "learning_rate": 7.074987011487619e-06, "loss": 0.3699, "step": 5070 }, { "epoch": 0.29320943118524717, "grad_norm": 48.09761207732283, "learning_rate": 7.06921433931767e-06, "loss": 0.3783, "step": 5080 }, { "epoch": 0.29378661510490317, "grad_norm": 1.9858233601346664, "learning_rate": 7.063441667147724e-06, "loss": 0.3885, "step": 5090 }, { "epoch": 0.29436379902455917, "grad_norm": 1.8720378976655367, "learning_rate": 7.057668994977775e-06, "loss": 0.408, "step": 5100 }, { "epoch": 0.29494098294421517, "grad_norm": 1.9939149039664568, "learning_rate": 7.051896322807828e-06, "loss": 0.3954, "step": 5110 }, { "epoch": 0.29551816686387117, "grad_norm": 1.654582540560213, "learning_rate": 7.04612365063788e-06, "loss": 0.3914, "step": 5120 }, { "epoch": 0.29609535078352717, "grad_norm": 3.06484715205326, "learning_rate": 7.040350978467933e-06, "loss": 0.3941, "step": 5130 }, { "epoch": 0.29667253470318317, "grad_norm": 4.663889722159032, "learning_rate": 7.034578306297986e-06, "loss": 0.3864, "step": 5140 }, { "epoch": 0.29724971862283917, "grad_norm": 7.729000917336516, "learning_rate": 7.028805634128039e-06, "loss": 0.383, "step": 5150 }, { "epoch": 0.29782690254249516, "grad_norm": 3.2930128906393357, "learning_rate": 7.023032961958091e-06, "loss": 0.4141, "step": 5160 }, { "epoch": 0.29840408646215116, "grad_norm": 2.410906648107205, "learning_rate": 7.017260289788144e-06, "loss": 0.3954, "step": 5170 }, { "epoch": 0.29898127038180716, "grad_norm": 2.334082494973446, "learning_rate": 7.011487617618196e-06, "loss": 0.3946, "step": 5180 }, { "epoch": 0.29955845430146316, "grad_norm": 2.5351433587502568, "learning_rate": 7.005714945448249e-06, "loss": 0.3793, "step": 5190 }, { "epoch": 0.30013563822111916, "grad_norm": 4.293591510370919, "learning_rate": 6.9999422732783e-06, "loss": 0.3932, "step": 5200 }, { "epoch": 0.30071282214077516, "grad_norm": 2.7812977150432032, "learning_rate": 6.994169601108353e-06, "loss": 0.4031, "step": 5210 }, { "epoch": 0.30129000606043116, "grad_norm": 2.6445327211143392, "learning_rate": 6.9883969289384055e-06, "loss": 0.3971, "step": 5220 }, { "epoch": 0.30186718998008716, "grad_norm": 3.667055707656093, "learning_rate": 6.9826242567684585e-06, "loss": 0.3863, "step": 5230 }, { "epoch": 0.30244437389974316, "grad_norm": 7.251793531417223, "learning_rate": 6.976851584598511e-06, "loss": 0.3957, "step": 5240 }, { "epoch": 0.30302155781939916, "grad_norm": 2.3512832022952415, "learning_rate": 6.971078912428564e-06, "loss": 0.3795, "step": 5250 }, { "epoch": 0.30359874173905516, "grad_norm": 3.8467854833379103, "learning_rate": 6.965306240258616e-06, "loss": 0.4022, "step": 5260 }, { "epoch": 0.30417592565871115, "grad_norm": 3.452869510119873, "learning_rate": 6.959533568088669e-06, "loss": 0.367, "step": 5270 }, { "epoch": 0.30475310957836715, "grad_norm": 1.7928454544936765, "learning_rate": 6.953760895918721e-06, "loss": 0.3835, "step": 5280 }, { "epoch": 0.30533029349802315, "grad_norm": 3.3141188092724057, "learning_rate": 6.947988223748774e-06, "loss": 0.3841, "step": 5290 }, { "epoch": 0.30590747741767915, "grad_norm": 7.1200801655892905, "learning_rate": 6.942215551578826e-06, "loss": 0.3851, "step": 5300 }, { "epoch": 0.30648466133733515, "grad_norm": 2.6078657985349563, "learning_rate": 6.936442879408879e-06, "loss": 0.3851, "step": 5310 }, { "epoch": 0.30706184525699115, "grad_norm": 3.8742908966217873, "learning_rate": 6.930670207238931e-06, "loss": 0.3657, "step": 5320 }, { "epoch": 0.30763902917664715, "grad_norm": 2.0698861633639885, "learning_rate": 6.924897535068984e-06, "loss": 0.3911, "step": 5330 }, { "epoch": 0.30821621309630315, "grad_norm": 2.5491051422292412, "learning_rate": 6.919124862899037e-06, "loss": 0.3765, "step": 5340 }, { "epoch": 0.30879339701595915, "grad_norm": 3.734949482545124, "learning_rate": 6.913352190729089e-06, "loss": 0.3895, "step": 5350 }, { "epoch": 0.30937058093561515, "grad_norm": 3.6071556886180356, "learning_rate": 6.907579518559142e-06, "loss": 0.3855, "step": 5360 }, { "epoch": 0.30994776485527115, "grad_norm": 2.048967073465003, "learning_rate": 6.901806846389194e-06, "loss": 0.3955, "step": 5370 }, { "epoch": 0.31052494877492715, "grad_norm": 3.6739243647918016, "learning_rate": 6.896034174219247e-06, "loss": 0.3886, "step": 5380 }, { "epoch": 0.31110213269458314, "grad_norm": 6.86781835267949, "learning_rate": 6.890261502049299e-06, "loss": 0.3865, "step": 5390 }, { "epoch": 0.31167931661423914, "grad_norm": 2.3848433309003445, "learning_rate": 6.884488829879352e-06, "loss": 0.382, "step": 5400 }, { "epoch": 0.31225650053389514, "grad_norm": 4.973233732358959, "learning_rate": 6.8787161577094044e-06, "loss": 0.3902, "step": 5410 }, { "epoch": 0.31283368445355114, "grad_norm": 4.047034417439155, "learning_rate": 6.8729434855394575e-06, "loss": 0.3848, "step": 5420 }, { "epoch": 0.31341086837320714, "grad_norm": 2.0502326573281464, "learning_rate": 6.867170813369509e-06, "loss": 0.3775, "step": 5430 }, { "epoch": 0.31398805229286314, "grad_norm": 2.3429460144586747, "learning_rate": 6.861398141199563e-06, "loss": 0.3854, "step": 5440 }, { "epoch": 0.31456523621251914, "grad_norm": 2.366729120218485, "learning_rate": 6.855625469029614e-06, "loss": 0.392, "step": 5450 }, { "epoch": 0.31514242013217514, "grad_norm": 2.846824703607735, "learning_rate": 6.849852796859667e-06, "loss": 0.3899, "step": 5460 }, { "epoch": 0.31571960405183114, "grad_norm": 4.059470208038008, "learning_rate": 6.844080124689719e-06, "loss": 0.3824, "step": 5470 }, { "epoch": 0.31629678797148714, "grad_norm": 3.060450364883157, "learning_rate": 6.838307452519772e-06, "loss": 0.386, "step": 5480 }, { "epoch": 0.31687397189114314, "grad_norm": 6.0419702046843735, "learning_rate": 6.832534780349824e-06, "loss": 0.3989, "step": 5490 }, { "epoch": 0.31745115581079913, "grad_norm": 148.73139965795136, "learning_rate": 6.826762108179877e-06, "loss": 0.3972, "step": 5500 }, { "epoch": 0.31802833973045513, "grad_norm": 4.457525994355414, "learning_rate": 6.8209894360099296e-06, "loss": 0.3917, "step": 5510 }, { "epoch": 0.31860552365011113, "grad_norm": 2.172441867365423, "learning_rate": 6.815216763839983e-06, "loss": 0.3877, "step": 5520 }, { "epoch": 0.31918270756976713, "grad_norm": 3.287076044270403, "learning_rate": 6.809444091670035e-06, "loss": 0.3697, "step": 5530 }, { "epoch": 0.31975989148942313, "grad_norm": 2.0889311701748747, "learning_rate": 6.803671419500088e-06, "loss": 0.3744, "step": 5540 }, { "epoch": 0.32033707540907913, "grad_norm": 3.2176355512083616, "learning_rate": 6.797898747330139e-06, "loss": 0.3936, "step": 5550 }, { "epoch": 0.3209142593287351, "grad_norm": 2.16842859391982, "learning_rate": 6.792126075160192e-06, "loss": 0.3945, "step": 5560 }, { "epoch": 0.3214914432483911, "grad_norm": 3.8670603387604134, "learning_rate": 6.786353402990244e-06, "loss": 0.3883, "step": 5570 }, { "epoch": 0.3220686271680471, "grad_norm": 16.096137517550215, "learning_rate": 6.780580730820297e-06, "loss": 0.3982, "step": 5580 }, { "epoch": 0.32264581108770307, "grad_norm": 3.253563357226522, "learning_rate": 6.7748080586503495e-06, "loss": 0.3971, "step": 5590 }, { "epoch": 0.32322299500735907, "grad_norm": 5.849409027392066, "learning_rate": 6.7690353864804025e-06, "loss": 0.3966, "step": 5600 }, { "epoch": 0.32380017892701507, "grad_norm": 4.446139033898797, "learning_rate": 6.763262714310455e-06, "loss": 0.3871, "step": 5610 }, { "epoch": 0.32437736284667107, "grad_norm": 2.4319593332542953, "learning_rate": 6.757490042140508e-06, "loss": 0.381, "step": 5620 }, { "epoch": 0.32495454676632707, "grad_norm": 4.734141112466146, "learning_rate": 6.75171736997056e-06, "loss": 0.3688, "step": 5630 }, { "epoch": 0.32553173068598307, "grad_norm": 3.0227214340364386, "learning_rate": 6.745944697800613e-06, "loss": 0.3939, "step": 5640 }, { "epoch": 0.32610891460563907, "grad_norm": 6.9528804097069274, "learning_rate": 6.740172025630664e-06, "loss": 0.3859, "step": 5650 }, { "epoch": 0.32668609852529507, "grad_norm": 2.4438240264660527, "learning_rate": 6.734399353460718e-06, "loss": 0.3929, "step": 5660 }, { "epoch": 0.32726328244495106, "grad_norm": 7.001401722934106, "learning_rate": 6.728626681290769e-06, "loss": 0.3878, "step": 5670 }, { "epoch": 0.32784046636460706, "grad_norm": 4.631134068889566, "learning_rate": 6.7228540091208224e-06, "loss": 0.3886, "step": 5680 }, { "epoch": 0.32841765028426306, "grad_norm": 3.0416584434332274, "learning_rate": 6.717081336950875e-06, "loss": 0.3732, "step": 5690 }, { "epoch": 0.32899483420391906, "grad_norm": 3.189118603447828, "learning_rate": 6.711308664780928e-06, "loss": 0.3788, "step": 5700 }, { "epoch": 0.32957201812357506, "grad_norm": 4.412271751435158, "learning_rate": 6.70553599261098e-06, "loss": 0.3903, "step": 5710 }, { "epoch": 0.33014920204323106, "grad_norm": 5.781124605717443, "learning_rate": 6.699763320441033e-06, "loss": 0.3742, "step": 5720 }, { "epoch": 0.33072638596288706, "grad_norm": 2.38240681303682, "learning_rate": 6.693990648271085e-06, "loss": 0.3846, "step": 5730 }, { "epoch": 0.33130356988254306, "grad_norm": 5.627940078972001, "learning_rate": 6.688217976101138e-06, "loss": 0.383, "step": 5740 }, { "epoch": 0.33188075380219906, "grad_norm": 2.5562400913295695, "learning_rate": 6.68244530393119e-06, "loss": 0.388, "step": 5750 }, { "epoch": 0.33245793772185506, "grad_norm": 2.009018555010131, "learning_rate": 6.676672631761243e-06, "loss": 0.3863, "step": 5760 }, { "epoch": 0.33303512164151106, "grad_norm": 2.6584190178994223, "learning_rate": 6.6708999595912945e-06, "loss": 0.382, "step": 5770 }, { "epoch": 0.33361230556116706, "grad_norm": 1.6637756869209672, "learning_rate": 6.6651272874213476e-06, "loss": 0.384, "step": 5780 }, { "epoch": 0.33418948948082305, "grad_norm": 2.3195781804624174, "learning_rate": 6.6593546152514e-06, "loss": 0.3766, "step": 5790 }, { "epoch": 0.33476667340047905, "grad_norm": 3.760084084073311, "learning_rate": 6.653581943081453e-06, "loss": 0.4035, "step": 5800 }, { "epoch": 0.33534385732013505, "grad_norm": 2.1527295119213607, "learning_rate": 6.647809270911505e-06, "loss": 0.3746, "step": 5810 }, { "epoch": 0.33592104123979105, "grad_norm": 2.50518271064483, "learning_rate": 6.642036598741558e-06, "loss": 0.3788, "step": 5820 }, { "epoch": 0.33649822515944705, "grad_norm": 3.4692947058918895, "learning_rate": 6.63626392657161e-06, "loss": 0.3893, "step": 5830 }, { "epoch": 0.33707540907910305, "grad_norm": 2.512775623033029, "learning_rate": 6.630491254401663e-06, "loss": 0.3719, "step": 5840 }, { "epoch": 0.33765259299875905, "grad_norm": 1.9666671304858914, "learning_rate": 6.624718582231716e-06, "loss": 0.3785, "step": 5850 }, { "epoch": 0.33822977691841505, "grad_norm": 2.724605715859374, "learning_rate": 6.618945910061768e-06, "loss": 0.4004, "step": 5860 }, { "epoch": 0.33880696083807105, "grad_norm": 2.7489712656132044, "learning_rate": 6.613173237891821e-06, "loss": 0.3771, "step": 5870 }, { "epoch": 0.33938414475772705, "grad_norm": 2.599485813176942, "learning_rate": 6.6074005657218735e-06, "loss": 0.4003, "step": 5880 }, { "epoch": 0.33996132867738305, "grad_norm": 2.746831225729797, "learning_rate": 6.6016278935519266e-06, "loss": 0.3699, "step": 5890 }, { "epoch": 0.34053851259703904, "grad_norm": 10.192368895005096, "learning_rate": 6.595855221381978e-06, "loss": 0.3825, "step": 5900 }, { "epoch": 0.34111569651669504, "grad_norm": 2.3362074486231785, "learning_rate": 6.590082549212031e-06, "loss": 0.3816, "step": 5910 }, { "epoch": 0.34169288043635104, "grad_norm": 3.2505345689597194, "learning_rate": 6.584309877042083e-06, "loss": 0.4147, "step": 5920 }, { "epoch": 0.34227006435600704, "grad_norm": 2.771813125028045, "learning_rate": 6.578537204872136e-06, "loss": 0.3822, "step": 5930 }, { "epoch": 0.34284724827566304, "grad_norm": 2.600143172725041, "learning_rate": 6.572764532702188e-06, "loss": 0.3718, "step": 5940 }, { "epoch": 0.34342443219531904, "grad_norm": 3.904138501463535, "learning_rate": 6.566991860532241e-06, "loss": 0.3886, "step": 5950 }, { "epoch": 0.34400161611497504, "grad_norm": 5.0319871188012515, "learning_rate": 6.5612191883622935e-06, "loss": 0.3877, "step": 5960 }, { "epoch": 0.34457880003463104, "grad_norm": 13.625359588311552, "learning_rate": 6.5554465161923465e-06, "loss": 0.3687, "step": 5970 }, { "epoch": 0.34515598395428704, "grad_norm": 2.920042053925227, "learning_rate": 6.549673844022399e-06, "loss": 0.3859, "step": 5980 }, { "epoch": 0.34573316787394304, "grad_norm": 3.5277767201369223, "learning_rate": 6.543901171852452e-06, "loss": 0.3789, "step": 5990 }, { "epoch": 0.34631035179359904, "grad_norm": 2.7899995578571426, "learning_rate": 6.538128499682503e-06, "loss": 0.3668, "step": 6000 }, { "epoch": 0.34688753571325504, "grad_norm": 5.64453785605591, "learning_rate": 6.532355827512557e-06, "loss": 0.3824, "step": 6010 }, { "epoch": 0.34746471963291103, "grad_norm": 3.440174338768187, "learning_rate": 6.526583155342608e-06, "loss": 0.3844, "step": 6020 }, { "epoch": 0.34804190355256703, "grad_norm": 3.9486486049020635, "learning_rate": 6.520810483172661e-06, "loss": 0.3897, "step": 6030 }, { "epoch": 0.34861908747222303, "grad_norm": 7.253991235141298, "learning_rate": 6.515037811002713e-06, "loss": 0.3677, "step": 6040 }, { "epoch": 0.34919627139187903, "grad_norm": 7.656685618930045, "learning_rate": 6.509265138832766e-06, "loss": 0.388, "step": 6050 }, { "epoch": 0.34977345531153503, "grad_norm": 2.213875264653562, "learning_rate": 6.503492466662819e-06, "loss": 0.3851, "step": 6060 }, { "epoch": 0.35035063923119103, "grad_norm": 5.474678610165808, "learning_rate": 6.497719794492872e-06, "loss": 0.3777, "step": 6070 }, { "epoch": 0.35092782315084703, "grad_norm": 4.538506198098333, "learning_rate": 6.491947122322924e-06, "loss": 0.3606, "step": 6080 }, { "epoch": 0.35150500707050303, "grad_norm": 2.7367624612828627, "learning_rate": 6.486174450152977e-06, "loss": 0.3824, "step": 6090 }, { "epoch": 0.352082190990159, "grad_norm": 3.9600996597048432, "learning_rate": 6.480401777983029e-06, "loss": 0.3865, "step": 6100 }, { "epoch": 0.352659374909815, "grad_norm": 6.138903729575434, "learning_rate": 6.474629105813082e-06, "loss": 0.3836, "step": 6110 }, { "epoch": 0.353236558829471, "grad_norm": 3.1110082739843676, "learning_rate": 6.468856433643133e-06, "loss": 0.3941, "step": 6120 }, { "epoch": 0.353813742749127, "grad_norm": 3.0125957250660997, "learning_rate": 6.463083761473186e-06, "loss": 0.3907, "step": 6130 }, { "epoch": 0.354390926668783, "grad_norm": 5.777332948819984, "learning_rate": 6.4573110893032385e-06, "loss": 0.3926, "step": 6140 }, { "epoch": 0.354968110588439, "grad_norm": 10.69646287431515, "learning_rate": 6.4515384171332915e-06, "loss": 0.3892, "step": 6150 }, { "epoch": 0.355545294508095, "grad_norm": 5.389336302495197, "learning_rate": 6.445765744963344e-06, "loss": 0.3957, "step": 6160 }, { "epoch": 0.356122478427751, "grad_norm": 12.802235491479053, "learning_rate": 6.439993072793397e-06, "loss": 0.3742, "step": 6170 }, { "epoch": 0.356699662347407, "grad_norm": 4.196338750242119, "learning_rate": 6.434220400623449e-06, "loss": 0.3878, "step": 6180 }, { "epoch": 0.357276846267063, "grad_norm": 3.7684375534000276, "learning_rate": 6.428447728453502e-06, "loss": 0.3724, "step": 6190 }, { "epoch": 0.357854030186719, "grad_norm": 2.4825477710744446, "learning_rate": 6.422675056283554e-06, "loss": 0.3665, "step": 6200 }, { "epoch": 0.358431214106375, "grad_norm": 2.5273547043428244, "learning_rate": 6.416902384113607e-06, "loss": 0.3682, "step": 6210 }, { "epoch": 0.359008398026031, "grad_norm": 3.3691141387535453, "learning_rate": 6.4111297119436584e-06, "loss": 0.3884, "step": 6220 }, { "epoch": 0.359585581945687, "grad_norm": 3.986041227799815, "learning_rate": 6.405357039773712e-06, "loss": 0.3793, "step": 6230 }, { "epoch": 0.360162765865343, "grad_norm": 4.388692532796717, "learning_rate": 6.399584367603764e-06, "loss": 0.3826, "step": 6240 }, { "epoch": 0.360739949784999, "grad_norm": 5.61124433419293, "learning_rate": 6.393811695433817e-06, "loss": 0.3725, "step": 6250 }, { "epoch": 0.361317133704655, "grad_norm": 5.79217310796387, "learning_rate": 6.388039023263869e-06, "loss": 0.3803, "step": 6260 }, { "epoch": 0.361894317624311, "grad_norm": 3.3092133128777017, "learning_rate": 6.382266351093922e-06, "loss": 0.3727, "step": 6270 }, { "epoch": 0.362471501543967, "grad_norm": 2.6436967571480308, "learning_rate": 6.376493678923974e-06, "loss": 0.3869, "step": 6280 }, { "epoch": 0.363048685463623, "grad_norm": 4.870192599092706, "learning_rate": 6.370721006754027e-06, "loss": 0.3711, "step": 6290 }, { "epoch": 0.36362586938327895, "grad_norm": 6.412850489358521, "learning_rate": 6.364948334584079e-06, "loss": 0.3778, "step": 6300 }, { "epoch": 0.36420305330293495, "grad_norm": 6.723734658950526, "learning_rate": 6.359175662414132e-06, "loss": 0.375, "step": 6310 }, { "epoch": 0.36478023722259095, "grad_norm": 2.9855811704461916, "learning_rate": 6.353402990244184e-06, "loss": 0.3968, "step": 6320 }, { "epoch": 0.36535742114224695, "grad_norm": 4.253318369577758, "learning_rate": 6.3476303180742374e-06, "loss": 0.3835, "step": 6330 }, { "epoch": 0.36593460506190295, "grad_norm": 3.350507256204714, "learning_rate": 6.341857645904289e-06, "loss": 0.3934, "step": 6340 }, { "epoch": 0.36651178898155895, "grad_norm": 7.6596745260829024, "learning_rate": 6.336084973734342e-06, "loss": 0.3695, "step": 6350 }, { "epoch": 0.36708897290121495, "grad_norm": 5.170501025053992, "learning_rate": 6.330312301564394e-06, "loss": 0.3822, "step": 6360 }, { "epoch": 0.36766615682087095, "grad_norm": 3.3572394366722342, "learning_rate": 6.324539629394447e-06, "loss": 0.3756, "step": 6370 }, { "epoch": 0.36824334074052695, "grad_norm": 4.222113472184697, "learning_rate": 6.3187669572245e-06, "loss": 0.3934, "step": 6380 }, { "epoch": 0.36882052466018295, "grad_norm": 4.239041230078147, "learning_rate": 6.312994285054552e-06, "loss": 0.3841, "step": 6390 }, { "epoch": 0.36939770857983895, "grad_norm": 3.8859530952931425, "learning_rate": 6.307221612884605e-06, "loss": 0.3747, "step": 6400 }, { "epoch": 0.36997489249949495, "grad_norm": 5.0107950962438315, "learning_rate": 6.301448940714657e-06, "loss": 0.3781, "step": 6410 }, { "epoch": 0.37055207641915094, "grad_norm": 19.339302178387896, "learning_rate": 6.29567626854471e-06, "loss": 0.3804, "step": 6420 }, { "epoch": 0.37112926033880694, "grad_norm": 103.6902739739912, "learning_rate": 6.2899035963747626e-06, "loss": 0.3614, "step": 6430 }, { "epoch": 0.37170644425846294, "grad_norm": 9.795833830078546, "learning_rate": 6.284130924204816e-06, "loss": 0.3806, "step": 6440 }, { "epoch": 0.37228362817811894, "grad_norm": 2.835184422290757, "learning_rate": 6.278358252034868e-06, "loss": 0.3635, "step": 6450 }, { "epoch": 0.37286081209777494, "grad_norm": 17.867434742010555, "learning_rate": 6.272585579864921e-06, "loss": 0.3825, "step": 6460 }, { "epoch": 0.37343799601743094, "grad_norm": 3.5064803159499554, "learning_rate": 6.266812907694972e-06, "loss": 0.3777, "step": 6470 }, { "epoch": 0.37401517993708694, "grad_norm": 6.18556265562049, "learning_rate": 6.261040235525025e-06, "loss": 0.354, "step": 6480 }, { "epoch": 0.37459236385674294, "grad_norm": 5.3643329344286, "learning_rate": 6.255267563355077e-06, "loss": 0.3653, "step": 6490 }, { "epoch": 0.37516954777639894, "grad_norm": 4.370066633666132, "learning_rate": 6.24949489118513e-06, "loss": 0.3799, "step": 6500 }, { "epoch": 0.37574673169605494, "grad_norm": 3.802802160469247, "learning_rate": 6.2437222190151825e-06, "loss": 0.3771, "step": 6510 }, { "epoch": 0.37632391561571094, "grad_norm": 2.999312565631662, "learning_rate": 6.2379495468452355e-06, "loss": 0.3761, "step": 6520 }, { "epoch": 0.37690109953536693, "grad_norm": 7.852310497644898, "learning_rate": 6.232176874675288e-06, "loss": 0.3823, "step": 6530 }, { "epoch": 0.37747828345502293, "grad_norm": 4.876630434197547, "learning_rate": 6.226404202505341e-06, "loss": 0.3704, "step": 6540 }, { "epoch": 0.37805546737467893, "grad_norm": 4.989568751322678, "learning_rate": 6.220631530335393e-06, "loss": 0.3836, "step": 6550 }, { "epoch": 0.37863265129433493, "grad_norm": 3.7548796873491135, "learning_rate": 6.214858858165446e-06, "loss": 0.3759, "step": 6560 }, { "epoch": 0.37920983521399093, "grad_norm": 2.7376006597130513, "learning_rate": 6.209086185995497e-06, "loss": 0.3825, "step": 6570 }, { "epoch": 0.37978701913364693, "grad_norm": 3.835955921955649, "learning_rate": 6.203313513825551e-06, "loss": 0.3892, "step": 6580 }, { "epoch": 0.38036420305330293, "grad_norm": 2.728138816573778, "learning_rate": 6.197540841655602e-06, "loss": 0.3806, "step": 6590 }, { "epoch": 0.38094138697295893, "grad_norm": 6.331854012865428, "learning_rate": 6.1917681694856554e-06, "loss": 0.3849, "step": 6600 }, { "epoch": 0.3815185708926149, "grad_norm": 2.7712160688455394, "learning_rate": 6.185995497315708e-06, "loss": 0.366, "step": 6610 }, { "epoch": 0.3820957548122709, "grad_norm": 2.2697691277132406, "learning_rate": 6.180222825145761e-06, "loss": 0.3625, "step": 6620 }, { "epoch": 0.3826729387319269, "grad_norm": 4.51763189851551, "learning_rate": 6.174450152975813e-06, "loss": 0.3745, "step": 6630 }, { "epoch": 0.3832501226515829, "grad_norm": 22.00920716007038, "learning_rate": 6.168677480805866e-06, "loss": 0.379, "step": 6640 }, { "epoch": 0.3838273065712389, "grad_norm": 8.50487988964264, "learning_rate": 6.162904808635918e-06, "loss": 0.3798, "step": 6650 }, { "epoch": 0.3844044904908949, "grad_norm": 2.756542308650349, "learning_rate": 6.157132136465971e-06, "loss": 0.3777, "step": 6660 }, { "epoch": 0.3849816744105509, "grad_norm": 1.8967629666152492, "learning_rate": 6.151359464296023e-06, "loss": 0.3536, "step": 6670 }, { "epoch": 0.3855588583302069, "grad_norm": 2.208647530669507, "learning_rate": 6.145586792126076e-06, "loss": 0.3757, "step": 6680 }, { "epoch": 0.3861360422498629, "grad_norm": 3.18500818944882, "learning_rate": 6.1398141199561275e-06, "loss": 0.381, "step": 6690 }, { "epoch": 0.3867132261695189, "grad_norm": 3.7319272107204267, "learning_rate": 6.1340414477861806e-06, "loss": 0.3895, "step": 6700 }, { "epoch": 0.3872904100891749, "grad_norm": 6.233879379077169, "learning_rate": 6.128268775616233e-06, "loss": 0.3931, "step": 6710 }, { "epoch": 0.3878675940088309, "grad_norm": 2.5172058960090147, "learning_rate": 6.122496103446286e-06, "loss": 0.3696, "step": 6720 }, { "epoch": 0.3884447779284869, "grad_norm": 2.4821687545852544, "learning_rate": 6.116723431276338e-06, "loss": 0.3783, "step": 6730 }, { "epoch": 0.3890219618481429, "grad_norm": 2.5811379708324984, "learning_rate": 6.110950759106391e-06, "loss": 0.3883, "step": 6740 }, { "epoch": 0.3895991457677989, "grad_norm": 4.606721510393016, "learning_rate": 6.105178086936443e-06, "loss": 0.364, "step": 6750 }, { "epoch": 0.3901763296874549, "grad_norm": 5.353229433626119, "learning_rate": 6.099405414766496e-06, "loss": 0.3882, "step": 6760 }, { "epoch": 0.3907535136071109, "grad_norm": 2.3516345262109617, "learning_rate": 6.093632742596548e-06, "loss": 0.3788, "step": 6770 }, { "epoch": 0.3913306975267669, "grad_norm": 11.487680253286674, "learning_rate": 6.087860070426601e-06, "loss": 0.3889, "step": 6780 }, { "epoch": 0.3919078814464229, "grad_norm": 3.18290202413646, "learning_rate": 6.082087398256653e-06, "loss": 0.3607, "step": 6790 }, { "epoch": 0.3924850653660789, "grad_norm": 2.7380986355865917, "learning_rate": 6.0763147260867065e-06, "loss": 0.3809, "step": 6800 }, { "epoch": 0.3930622492857349, "grad_norm": 2.985403565819371, "learning_rate": 6.070542053916758e-06, "loss": 0.3785, "step": 6810 }, { "epoch": 0.3936394332053909, "grad_norm": 7.257850197480963, "learning_rate": 6.064769381746811e-06, "loss": 0.3813, "step": 6820 }, { "epoch": 0.3942166171250469, "grad_norm": 2.651981631840983, "learning_rate": 6.058996709576863e-06, "loss": 0.3911, "step": 6830 }, { "epoch": 0.3947938010447029, "grad_norm": 3.007540853480136, "learning_rate": 6.053224037406916e-06, "loss": 0.3787, "step": 6840 }, { "epoch": 0.3953709849643589, "grad_norm": 4.967113215124695, "learning_rate": 6.047451365236968e-06, "loss": 0.3729, "step": 6850 }, { "epoch": 0.3959481688840149, "grad_norm": 2.4113519734571627, "learning_rate": 6.041678693067021e-06, "loss": 0.3576, "step": 6860 }, { "epoch": 0.3965253528036709, "grad_norm": 1.5215990778439656, "learning_rate": 6.0359060208970734e-06, "loss": 0.3813, "step": 6870 }, { "epoch": 0.3971025367233269, "grad_norm": 1.9980571139407164, "learning_rate": 6.0301333487271265e-06, "loss": 0.3764, "step": 6880 }, { "epoch": 0.3976797206429829, "grad_norm": 3.851850368869639, "learning_rate": 6.024360676557179e-06, "loss": 0.3793, "step": 6890 }, { "epoch": 0.3982569045626389, "grad_norm": 2.819413612633571, "learning_rate": 6.018588004387232e-06, "loss": 0.3915, "step": 6900 }, { "epoch": 0.3988340884822949, "grad_norm": 2.2801532893497733, "learning_rate": 6.012815332217285e-06, "loss": 0.3927, "step": 6910 }, { "epoch": 0.3994112724019509, "grad_norm": 3.0396536780138734, "learning_rate": 6.007042660047336e-06, "loss": 0.3787, "step": 6920 }, { "epoch": 0.3999884563216069, "grad_norm": 1.9908365824878806, "learning_rate": 6.00126998787739e-06, "loss": 0.3751, "step": 6930 }, { "epoch": 0.4005656402412629, "grad_norm": 2.5287281468598817, "learning_rate": 5.995497315707441e-06, "loss": 0.3739, "step": 6940 }, { "epoch": 0.4011428241609189, "grad_norm": 2.3877649628077404, "learning_rate": 5.989724643537494e-06, "loss": 0.3931, "step": 6950 }, { "epoch": 0.4017200080805749, "grad_norm": 1.751274625854989, "learning_rate": 5.983951971367546e-06, "loss": 0.3725, "step": 6960 }, { "epoch": 0.4022971920002309, "grad_norm": 7.097265852789204, "learning_rate": 5.978179299197599e-06, "loss": 0.3771, "step": 6970 }, { "epoch": 0.4028743759198869, "grad_norm": 1.5751898011207688, "learning_rate": 5.972406627027652e-06, "loss": 0.3804, "step": 6980 }, { "epoch": 0.4034515598395429, "grad_norm": 3.487612778253862, "learning_rate": 5.966633954857705e-06, "loss": 0.3677, "step": 6990 }, { "epoch": 0.4040287437591989, "grad_norm": 2.622057972311098, "learning_rate": 5.960861282687757e-06, "loss": 0.3778, "step": 7000 }, { "epoch": 0.4046059276788549, "grad_norm": 2.7368469799858532, "learning_rate": 5.95508861051781e-06, "loss": 0.3768, "step": 7010 }, { "epoch": 0.4051831115985109, "grad_norm": 1.6133398127083427, "learning_rate": 5.949315938347862e-06, "loss": 0.3799, "step": 7020 }, { "epoch": 0.4057602955181669, "grad_norm": 3.191334805976918, "learning_rate": 5.943543266177915e-06, "loss": 0.3813, "step": 7030 }, { "epoch": 0.40633747943782283, "grad_norm": 2.8991810624406784, "learning_rate": 5.937770594007966e-06, "loss": 0.376, "step": 7040 }, { "epoch": 0.40691466335747883, "grad_norm": 2.0785390805202684, "learning_rate": 5.931997921838019e-06, "loss": 0.3729, "step": 7050 }, { "epoch": 0.40749184727713483, "grad_norm": 1.9512094562324862, "learning_rate": 5.9262252496680715e-06, "loss": 0.3732, "step": 7060 }, { "epoch": 0.40806903119679083, "grad_norm": 3.3176725840902206, "learning_rate": 5.9204525774981245e-06, "loss": 0.3874, "step": 7070 }, { "epoch": 0.40864621511644683, "grad_norm": 3.000837724994079, "learning_rate": 5.914679905328177e-06, "loss": 0.3745, "step": 7080 }, { "epoch": 0.40922339903610283, "grad_norm": 1.8158962267665133, "learning_rate": 5.90890723315823e-06, "loss": 0.3756, "step": 7090 }, { "epoch": 0.40980058295575883, "grad_norm": 2.324389501252935, "learning_rate": 5.903134560988282e-06, "loss": 0.3886, "step": 7100 }, { "epoch": 0.41037776687541483, "grad_norm": 2.894571332845524, "learning_rate": 5.897361888818335e-06, "loss": 0.3869, "step": 7110 }, { "epoch": 0.41095495079507083, "grad_norm": 2.629677485680801, "learning_rate": 5.891589216648387e-06, "loss": 0.3615, "step": 7120 }, { "epoch": 0.4115321347147268, "grad_norm": 13.759434005163566, "learning_rate": 5.88581654447844e-06, "loss": 0.3678, "step": 7130 }, { "epoch": 0.4121093186343828, "grad_norm": 3.187249272214218, "learning_rate": 5.8800438723084915e-06, "loss": 0.3496, "step": 7140 }, { "epoch": 0.4126865025540388, "grad_norm": 25.01828326406148, "learning_rate": 5.874271200138545e-06, "loss": 0.3831, "step": 7150 }, { "epoch": 0.4132636864736948, "grad_norm": 4.28899556920541, "learning_rate": 5.868498527968597e-06, "loss": 0.3847, "step": 7160 }, { "epoch": 0.4138408703933508, "grad_norm": 5.017592395582479, "learning_rate": 5.86272585579865e-06, "loss": 0.3528, "step": 7170 }, { "epoch": 0.4144180543130068, "grad_norm": 3.6138133944499686, "learning_rate": 5.856953183628702e-06, "loss": 0.3615, "step": 7180 }, { "epoch": 0.4149952382326628, "grad_norm": 11.345281193048963, "learning_rate": 5.851180511458755e-06, "loss": 0.36, "step": 7190 }, { "epoch": 0.4155724221523188, "grad_norm": 4.1575514029124525, "learning_rate": 5.845407839288807e-06, "loss": 0.3707, "step": 7200 }, { "epoch": 0.4161496060719748, "grad_norm": 5.184879687211155, "learning_rate": 5.83963516711886e-06, "loss": 0.3584, "step": 7210 }, { "epoch": 0.4167267899916308, "grad_norm": 3.6353294525038256, "learning_rate": 5.833862494948912e-06, "loss": 0.3922, "step": 7220 }, { "epoch": 0.4173039739112868, "grad_norm": 10.083912587939164, "learning_rate": 5.828089822778965e-06, "loss": 0.358, "step": 7230 }, { "epoch": 0.4178811578309428, "grad_norm": 3.795430776940293, "learning_rate": 5.822317150609017e-06, "loss": 0.3584, "step": 7240 }, { "epoch": 0.4184583417505988, "grad_norm": 2.735432424886805, "learning_rate": 5.8165444784390704e-06, "loss": 0.3628, "step": 7250 }, { "epoch": 0.4190355256702548, "grad_norm": 3.7538394849350034, "learning_rate": 5.810771806269122e-06, "loss": 0.3808, "step": 7260 }, { "epoch": 0.4196127095899108, "grad_norm": 3.486146744000872, "learning_rate": 5.804999134099175e-06, "loss": 0.3677, "step": 7270 }, { "epoch": 0.4201898935095668, "grad_norm": 6.482596192596544, "learning_rate": 5.799226461929227e-06, "loss": 0.3715, "step": 7280 }, { "epoch": 0.4207670774292228, "grad_norm": 15.287481532081374, "learning_rate": 5.79345378975928e-06, "loss": 0.3607, "step": 7290 }, { "epoch": 0.4213442613488788, "grad_norm": 5.756011268210783, "learning_rate": 5.787681117589332e-06, "loss": 0.3829, "step": 7300 }, { "epoch": 0.4219214452685348, "grad_norm": 5.238271188240731, "learning_rate": 5.781908445419385e-06, "loss": 0.3685, "step": 7310 }, { "epoch": 0.4224986291881908, "grad_norm": 5.072523904302979, "learning_rate": 5.776135773249437e-06, "loss": 0.3785, "step": 7320 }, { "epoch": 0.4230758131078468, "grad_norm": 2.7230926250144494, "learning_rate": 5.77036310107949e-06, "loss": 0.3586, "step": 7330 }, { "epoch": 0.4236529970275028, "grad_norm": 3.1651643016202438, "learning_rate": 5.7645904289095425e-06, "loss": 0.3675, "step": 7340 }, { "epoch": 0.4242301809471588, "grad_norm": 5.575569273909336, "learning_rate": 5.7588177567395956e-06, "loss": 0.3659, "step": 7350 }, { "epoch": 0.4248073648668148, "grad_norm": 3.4372405530276686, "learning_rate": 5.753045084569647e-06, "loss": 0.3562, "step": 7360 }, { "epoch": 0.4253845487864708, "grad_norm": 3.1380962366302203, "learning_rate": 5.747272412399701e-06, "loss": 0.3665, "step": 7370 }, { "epoch": 0.4259617327061268, "grad_norm": 4.195020514299469, "learning_rate": 5.741499740229752e-06, "loss": 0.3834, "step": 7380 }, { "epoch": 0.4265389166257828, "grad_norm": 2.5201571788814103, "learning_rate": 5.735727068059805e-06, "loss": 0.3598, "step": 7390 }, { "epoch": 0.4271161005454388, "grad_norm": 7.757366621212017, "learning_rate": 5.729954395889857e-06, "loss": 0.365, "step": 7400 }, { "epoch": 0.4276932844650948, "grad_norm": 3.6863947123217438, "learning_rate": 5.72418172371991e-06, "loss": 0.3605, "step": 7410 }, { "epoch": 0.4282704683847508, "grad_norm": 2.713386138832286, "learning_rate": 5.7184090515499625e-06, "loss": 0.3692, "step": 7420 }, { "epoch": 0.4288476523044068, "grad_norm": 4.061486235134526, "learning_rate": 5.7126363793800155e-06, "loss": 0.3615, "step": 7430 }, { "epoch": 0.4294248362240628, "grad_norm": 3.171386095616653, "learning_rate": 5.7068637072100685e-06, "loss": 0.3742, "step": 7440 }, { "epoch": 0.4300020201437188, "grad_norm": 3.0632786743675173, "learning_rate": 5.701091035040121e-06, "loss": 0.3669, "step": 7450 }, { "epoch": 0.4305792040633748, "grad_norm": 5.9823682538619884, "learning_rate": 5.695318362870174e-06, "loss": 0.3625, "step": 7460 }, { "epoch": 0.4311563879830308, "grad_norm": 2.587936253733615, "learning_rate": 5.689545690700226e-06, "loss": 0.3654, "step": 7470 }, { "epoch": 0.4317335719026868, "grad_norm": 3.3311960193145507, "learning_rate": 5.683773018530279e-06, "loss": 0.3849, "step": 7480 }, { "epoch": 0.4323107558223428, "grad_norm": 7.2504266943512885, "learning_rate": 5.67800034636033e-06, "loss": 0.383, "step": 7490 }, { "epoch": 0.4328879397419988, "grad_norm": 2.565927845188403, "learning_rate": 5.672227674190383e-06, "loss": 0.3689, "step": 7500 }, { "epoch": 0.4334651236616548, "grad_norm": 2.596436540237007, "learning_rate": 5.6664550020204354e-06, "loss": 0.3783, "step": 7510 }, { "epoch": 0.4340423075813108, "grad_norm": 3.2568921956880397, "learning_rate": 5.6606823298504884e-06, "loss": 0.3622, "step": 7520 }, { "epoch": 0.4346194915009668, "grad_norm": 3.534180092969136, "learning_rate": 5.654909657680541e-06, "loss": 0.374, "step": 7530 }, { "epoch": 0.4351966754206228, "grad_norm": 2.320956209280894, "learning_rate": 5.649136985510594e-06, "loss": 0.3618, "step": 7540 }, { "epoch": 0.4357738593402788, "grad_norm": 2.7986565358175732, "learning_rate": 5.643364313340646e-06, "loss": 0.376, "step": 7550 }, { "epoch": 0.4363510432599348, "grad_norm": 4.3640055595975955, "learning_rate": 5.637591641170699e-06, "loss": 0.359, "step": 7560 }, { "epoch": 0.4369282271795908, "grad_norm": 2.4295878318318893, "learning_rate": 5.631818969000751e-06, "loss": 0.3837, "step": 7570 }, { "epoch": 0.4375054110992468, "grad_norm": 2.5358015892610304, "learning_rate": 5.626046296830804e-06, "loss": 0.3824, "step": 7580 }, { "epoch": 0.4380825950189028, "grad_norm": 2.732560193932699, "learning_rate": 5.620273624660856e-06, "loss": 0.3657, "step": 7590 }, { "epoch": 0.4386597789385588, "grad_norm": 4.150107259821488, "learning_rate": 5.614500952490909e-06, "loss": 0.3805, "step": 7600 }, { "epoch": 0.4392369628582148, "grad_norm": 6.027002919837396, "learning_rate": 5.6087282803209605e-06, "loss": 0.3733, "step": 7610 }, { "epoch": 0.4398141467778708, "grad_norm": 4.383047686001244, "learning_rate": 5.6029556081510136e-06, "loss": 0.3798, "step": 7620 }, { "epoch": 0.4403913306975268, "grad_norm": 3.183548428631444, "learning_rate": 5.597182935981066e-06, "loss": 0.3704, "step": 7630 }, { "epoch": 0.4409685146171828, "grad_norm": 3.2995502847867364, "learning_rate": 5.591410263811119e-06, "loss": 0.3868, "step": 7640 }, { "epoch": 0.4415456985368388, "grad_norm": 2.9302522543070384, "learning_rate": 5.585637591641171e-06, "loss": 0.3719, "step": 7650 }, { "epoch": 0.4421228824564948, "grad_norm": 3.699827330713927, "learning_rate": 5.579864919471224e-06, "loss": 0.3843, "step": 7660 }, { "epoch": 0.4427000663761508, "grad_norm": 10.131740140492866, "learning_rate": 5.574092247301276e-06, "loss": 0.3741, "step": 7670 }, { "epoch": 0.4432772502958068, "grad_norm": 2.53184828015941, "learning_rate": 5.568319575131329e-06, "loss": 0.3679, "step": 7680 }, { "epoch": 0.4438544342154628, "grad_norm": 2.735336367396379, "learning_rate": 5.562546902961381e-06, "loss": 0.3794, "step": 7690 }, { "epoch": 0.4444316181351188, "grad_norm": 3.118950089045635, "learning_rate": 5.556774230791434e-06, "loss": 0.3749, "step": 7700 }, { "epoch": 0.4450088020547748, "grad_norm": 2.0345152708541736, "learning_rate": 5.551001558621486e-06, "loss": 0.3797, "step": 7710 }, { "epoch": 0.4455859859744308, "grad_norm": 2.216729946357023, "learning_rate": 5.5452288864515395e-06, "loss": 0.3704, "step": 7720 }, { "epoch": 0.4461631698940868, "grad_norm": 11.605317744790039, "learning_rate": 5.539456214281591e-06, "loss": 0.3826, "step": 7730 }, { "epoch": 0.4467403538137428, "grad_norm": 5.555060033849291, "learning_rate": 5.533683542111644e-06, "loss": 0.3768, "step": 7740 }, { "epoch": 0.4473175377333988, "grad_norm": 2.5005756710793507, "learning_rate": 5.527910869941696e-06, "loss": 0.3667, "step": 7750 }, { "epoch": 0.4478947216530548, "grad_norm": 4.5009505264076655, "learning_rate": 5.522138197771749e-06, "loss": 0.3773, "step": 7760 }, { "epoch": 0.44847190557271077, "grad_norm": 2.6271424623008945, "learning_rate": 5.516365525601801e-06, "loss": 0.3737, "step": 7770 }, { "epoch": 0.4490490894923667, "grad_norm": 12.12839502479119, "learning_rate": 5.510592853431854e-06, "loss": 0.3573, "step": 7780 }, { "epoch": 0.4496262734120227, "grad_norm": 4.194839597547066, "learning_rate": 5.5048201812619064e-06, "loss": 0.3774, "step": 7790 }, { "epoch": 0.4502034573316787, "grad_norm": 2.1887367164733016, "learning_rate": 5.4990475090919595e-06, "loss": 0.381, "step": 7800 }, { "epoch": 0.4507806412513347, "grad_norm": 3.1886528624855925, "learning_rate": 5.493274836922012e-06, "loss": 0.37, "step": 7810 }, { "epoch": 0.4513578251709907, "grad_norm": 2.4110545743480527, "learning_rate": 5.487502164752065e-06, "loss": 0.3604, "step": 7820 }, { "epoch": 0.4519350090906467, "grad_norm": 2.9847050808092166, "learning_rate": 5.481729492582116e-06, "loss": 0.3675, "step": 7830 }, { "epoch": 0.4525121930103027, "grad_norm": 5.885118240316819, "learning_rate": 5.475956820412169e-06, "loss": 0.3856, "step": 7840 }, { "epoch": 0.4530893769299587, "grad_norm": 5.303575867358966, "learning_rate": 5.470184148242221e-06, "loss": 0.3607, "step": 7850 }, { "epoch": 0.4536665608496147, "grad_norm": 7.1549036006295035, "learning_rate": 5.464411476072274e-06, "loss": 0.3699, "step": 7860 }, { "epoch": 0.4542437447692707, "grad_norm": 1.8926865310221554, "learning_rate": 5.458638803902326e-06, "loss": 0.3676, "step": 7870 }, { "epoch": 0.4548209286889267, "grad_norm": 4.130403133399794, "learning_rate": 5.452866131732379e-06, "loss": 0.3624, "step": 7880 }, { "epoch": 0.4553981126085827, "grad_norm": 3.4241484954051677, "learning_rate": 5.4470934595624316e-06, "loss": 0.3652, "step": 7890 }, { "epoch": 0.4559752965282387, "grad_norm": 7.8305369558715725, "learning_rate": 5.441320787392485e-06, "loss": 0.35, "step": 7900 }, { "epoch": 0.4565524804478947, "grad_norm": 7.1372161394964575, "learning_rate": 5.435548115222537e-06, "loss": 0.3742, "step": 7910 }, { "epoch": 0.4571296643675507, "grad_norm": 20.273585832785447, "learning_rate": 5.42977544305259e-06, "loss": 0.3769, "step": 7920 }, { "epoch": 0.4577068482872067, "grad_norm": 2.794498898766565, "learning_rate": 5.424002770882641e-06, "loss": 0.384, "step": 7930 }, { "epoch": 0.4582840322068627, "grad_norm": 7.066469891261649, "learning_rate": 5.418230098712695e-06, "loss": 0.3795, "step": 7940 }, { "epoch": 0.4588612161265187, "grad_norm": 2.8353456413911737, "learning_rate": 5.412457426542746e-06, "loss": 0.3643, "step": 7950 }, { "epoch": 0.4594384000461747, "grad_norm": 3.0383570429357345, "learning_rate": 5.406684754372799e-06, "loss": 0.3718, "step": 7960 }, { "epoch": 0.4600155839658307, "grad_norm": 3.6164323938018734, "learning_rate": 5.400912082202852e-06, "loss": 0.3849, "step": 7970 }, { "epoch": 0.4605927678854867, "grad_norm": 2.7123845726783262, "learning_rate": 5.3951394100329045e-06, "loss": 0.3683, "step": 7980 }, { "epoch": 0.4611699518051427, "grad_norm": 2.312599361300853, "learning_rate": 5.3893667378629575e-06, "loss": 0.3798, "step": 7990 }, { "epoch": 0.4617471357247987, "grad_norm": 3.251330933463336, "learning_rate": 5.38359406569301e-06, "loss": 0.3587, "step": 8000 }, { "epoch": 0.4623243196444547, "grad_norm": 6.596375932856641, "learning_rate": 5.377821393523063e-06, "loss": 0.3802, "step": 8010 }, { "epoch": 0.4629015035641107, "grad_norm": 1.8050467998180781, "learning_rate": 5.372048721353115e-06, "loss": 0.3712, "step": 8020 }, { "epoch": 0.4634786874837667, "grad_norm": 5.472845808317412, "learning_rate": 5.366276049183168e-06, "loss": 0.377, "step": 8030 }, { "epoch": 0.4640558714034227, "grad_norm": 21.192833022577837, "learning_rate": 5.36050337701322e-06, "loss": 0.3745, "step": 8040 }, { "epoch": 0.4646330553230787, "grad_norm": 2.3954727021255677, "learning_rate": 5.354730704843273e-06, "loss": 0.368, "step": 8050 }, { "epoch": 0.4652102392427347, "grad_norm": 3.6751726559767652, "learning_rate": 5.3489580326733245e-06, "loss": 0.3803, "step": 8060 }, { "epoch": 0.4657874231623907, "grad_norm": 2.6299220528922316, "learning_rate": 5.3431853605033775e-06, "loss": 0.3655, "step": 8070 }, { "epoch": 0.4663646070820467, "grad_norm": 4.147182980327485, "learning_rate": 5.33741268833343e-06, "loss": 0.376, "step": 8080 }, { "epoch": 0.4669417910017027, "grad_norm": 2.3318408925884526, "learning_rate": 5.331640016163483e-06, "loss": 0.3793, "step": 8090 }, { "epoch": 0.4675189749213587, "grad_norm": 2.5630808791681106, "learning_rate": 5.325867343993535e-06, "loss": 0.3841, "step": 8100 }, { "epoch": 0.4680961588410147, "grad_norm": 16.561602524939726, "learning_rate": 5.320094671823588e-06, "loss": 0.3628, "step": 8110 }, { "epoch": 0.4686733427606707, "grad_norm": 2.655435817667697, "learning_rate": 5.31432199965364e-06, "loss": 0.3827, "step": 8120 }, { "epoch": 0.4692505266803267, "grad_norm": 2.082608101455672, "learning_rate": 5.308549327483693e-06, "loss": 0.3726, "step": 8130 }, { "epoch": 0.4698277105999827, "grad_norm": 7.450725164048278, "learning_rate": 5.302776655313745e-06, "loss": 0.3607, "step": 8140 }, { "epoch": 0.4704048945196387, "grad_norm": 3.727260702005544, "learning_rate": 5.297003983143798e-06, "loss": 0.3666, "step": 8150 }, { "epoch": 0.4709820784392947, "grad_norm": 2.9372607445086816, "learning_rate": 5.29123131097385e-06, "loss": 0.3638, "step": 8160 }, { "epoch": 0.4715592623589507, "grad_norm": 3.011083825137573, "learning_rate": 5.2854586388039034e-06, "loss": 0.3711, "step": 8170 }, { "epoch": 0.4721364462786067, "grad_norm": 5.58881805540413, "learning_rate": 5.279685966633955e-06, "loss": 0.3737, "step": 8180 }, { "epoch": 0.4727136301982627, "grad_norm": 1.9307643066917193, "learning_rate": 5.273913294464008e-06, "loss": 0.3633, "step": 8190 }, { "epoch": 0.4732908141179187, "grad_norm": 2.246158235550298, "learning_rate": 5.26814062229406e-06, "loss": 0.3811, "step": 8200 }, { "epoch": 0.4738679980375747, "grad_norm": 5.378095626076901, "learning_rate": 5.262367950124113e-06, "loss": 0.3772, "step": 8210 }, { "epoch": 0.4744451819572307, "grad_norm": 3.594146830036725, "learning_rate": 5.256595277954165e-06, "loss": 0.3689, "step": 8220 }, { "epoch": 0.4750223658768867, "grad_norm": 1.880161955339062, "learning_rate": 5.250822605784218e-06, "loss": 0.3682, "step": 8230 }, { "epoch": 0.4755995497965427, "grad_norm": 3.655512057019781, "learning_rate": 5.24504993361427e-06, "loss": 0.3601, "step": 8240 }, { "epoch": 0.4761767337161987, "grad_norm": 4.5822643890603345, "learning_rate": 5.239277261444323e-06, "loss": 0.3799, "step": 8250 }, { "epoch": 0.4767539176358547, "grad_norm": 10.062266012222976, "learning_rate": 5.2335045892743755e-06, "loss": 0.3651, "step": 8260 }, { "epoch": 0.4773311015555107, "grad_norm": 7.820363388846345, "learning_rate": 5.2277319171044286e-06, "loss": 0.3629, "step": 8270 }, { "epoch": 0.47790828547516667, "grad_norm": 3.04998159803599, "learning_rate": 5.22195924493448e-06, "loss": 0.3693, "step": 8280 }, { "epoch": 0.47848546939482267, "grad_norm": 4.5635676614995475, "learning_rate": 5.216186572764533e-06, "loss": 0.384, "step": 8290 }, { "epoch": 0.47906265331447867, "grad_norm": 2.1736842754902708, "learning_rate": 5.210413900594585e-06, "loss": 0.363, "step": 8300 }, { "epoch": 0.47963983723413467, "grad_norm": 2.5418326594021887, "learning_rate": 5.204641228424638e-06, "loss": 0.3697, "step": 8310 }, { "epoch": 0.48021702115379067, "grad_norm": 2.692093940326692, "learning_rate": 5.19886855625469e-06, "loss": 0.3691, "step": 8320 }, { "epoch": 0.48079420507344667, "grad_norm": 2.4052933532982816, "learning_rate": 5.193095884084743e-06, "loss": 0.3506, "step": 8330 }, { "epoch": 0.48137138899310267, "grad_norm": 6.620567354797733, "learning_rate": 5.1873232119147955e-06, "loss": 0.3838, "step": 8340 }, { "epoch": 0.48194857291275867, "grad_norm": 2.9255268074292555, "learning_rate": 5.1815505397448485e-06, "loss": 0.3694, "step": 8350 }, { "epoch": 0.48252575683241467, "grad_norm": 13.37602989555681, "learning_rate": 5.175777867574901e-06, "loss": 0.3752, "step": 8360 }, { "epoch": 0.48310294075207066, "grad_norm": 3.511794563236054, "learning_rate": 5.170005195404954e-06, "loss": 0.3741, "step": 8370 }, { "epoch": 0.48368012467172666, "grad_norm": 4.7758963928181375, "learning_rate": 5.164232523235006e-06, "loss": 0.369, "step": 8380 }, { "epoch": 0.48425730859138266, "grad_norm": 3.8240515141083002, "learning_rate": 5.158459851065059e-06, "loss": 0.374, "step": 8390 }, { "epoch": 0.48483449251103866, "grad_norm": 8.100590461673363, "learning_rate": 5.15268717889511e-06, "loss": 0.3563, "step": 8400 }, { "epoch": 0.48541167643069466, "grad_norm": 4.117224392256427, "learning_rate": 5.146914506725163e-06, "loss": 0.3626, "step": 8410 }, { "epoch": 0.48598886035035066, "grad_norm": 3.4662871354919904, "learning_rate": 5.141141834555215e-06, "loss": 0.3848, "step": 8420 }, { "epoch": 0.48656604427000666, "grad_norm": 4.959524985987204, "learning_rate": 5.1353691623852684e-06, "loss": 0.3649, "step": 8430 }, { "epoch": 0.48714322818966266, "grad_norm": 5.172300477902163, "learning_rate": 5.129596490215321e-06, "loss": 0.3732, "step": 8440 }, { "epoch": 0.48772041210931866, "grad_norm": 2.996587870660032, "learning_rate": 5.123823818045374e-06, "loss": 0.3544, "step": 8450 }, { "epoch": 0.48829759602897466, "grad_norm": 5.228719957469869, "learning_rate": 5.118051145875426e-06, "loss": 0.3623, "step": 8460 }, { "epoch": 0.48887477994863066, "grad_norm": 7.078111368668544, "learning_rate": 5.112278473705479e-06, "loss": 0.3521, "step": 8470 }, { "epoch": 0.48945196386828665, "grad_norm": 4.585249570356133, "learning_rate": 5.106505801535531e-06, "loss": 0.3602, "step": 8480 }, { "epoch": 0.49002914778794265, "grad_norm": 3.769909535272591, "learning_rate": 5.100733129365584e-06, "loss": 0.3642, "step": 8490 }, { "epoch": 0.49060633170759865, "grad_norm": 5.843171007267111, "learning_rate": 5.094960457195637e-06, "loss": 0.3844, "step": 8500 }, { "epoch": 0.49118351562725465, "grad_norm": 5.011620359523228, "learning_rate": 5.089187785025688e-06, "loss": 0.3748, "step": 8510 }, { "epoch": 0.4917606995469106, "grad_norm": 3.1629027771513667, "learning_rate": 5.083415112855742e-06, "loss": 0.3756, "step": 8520 }, { "epoch": 0.4923378834665666, "grad_norm": 3.4287778879420583, "learning_rate": 5.0776424406857936e-06, "loss": 0.3654, "step": 8530 }, { "epoch": 0.4929150673862226, "grad_norm": 2.3995913819961077, "learning_rate": 5.0718697685158466e-06, "loss": 0.3497, "step": 8540 }, { "epoch": 0.4934922513058786, "grad_norm": 7.033069082305426, "learning_rate": 5.066097096345899e-06, "loss": 0.3679, "step": 8550 }, { "epoch": 0.4940694352255346, "grad_norm": 9.94984806585782, "learning_rate": 5.060324424175952e-06, "loss": 0.3528, "step": 8560 }, { "epoch": 0.4946466191451906, "grad_norm": 3.544058073278801, "learning_rate": 5.054551752006004e-06, "loss": 0.3578, "step": 8570 }, { "epoch": 0.4952238030648466, "grad_norm": 2.5120291222524176, "learning_rate": 5.048779079836057e-06, "loss": 0.3548, "step": 8580 }, { "epoch": 0.4958009869845026, "grad_norm": 3.3522580875814887, "learning_rate": 5.043006407666109e-06, "loss": 0.3606, "step": 8590 }, { "epoch": 0.4963781709041586, "grad_norm": 2.412854685811211, "learning_rate": 5.037233735496162e-06, "loss": 0.3605, "step": 8600 }, { "epoch": 0.4969553548238146, "grad_norm": 2.9209835829832613, "learning_rate": 5.031461063326214e-06, "loss": 0.3566, "step": 8610 }, { "epoch": 0.4975325387434706, "grad_norm": 2.6580624391224568, "learning_rate": 5.025688391156267e-06, "loss": 0.3806, "step": 8620 }, { "epoch": 0.4981097226631266, "grad_norm": 2.9252114550534567, "learning_rate": 5.019915718986319e-06, "loss": 0.3596, "step": 8630 }, { "epoch": 0.4986869065827826, "grad_norm": 2.432042802537537, "learning_rate": 5.014143046816372e-06, "loss": 0.3849, "step": 8640 }, { "epoch": 0.4992640905024386, "grad_norm": 14.195133545125286, "learning_rate": 5.008370374646424e-06, "loss": 0.3602, "step": 8650 }, { "epoch": 0.4998412744220946, "grad_norm": 88.84290269765289, "learning_rate": 5.002597702476477e-06, "loss": 0.3642, "step": 8660 }, { "epoch": 0.5004184583417506, "grad_norm": 2.935701463863382, "learning_rate": 4.996825030306529e-06, "loss": 0.3792, "step": 8670 }, { "epoch": 0.5009956422614066, "grad_norm": 3.1279829631060396, "learning_rate": 4.991052358136581e-06, "loss": 0.3693, "step": 8680 }, { "epoch": 0.5015728261810626, "grad_norm": 10.193114728366995, "learning_rate": 4.985279685966634e-06, "loss": 0.3597, "step": 8690 }, { "epoch": 0.5021500101007186, "grad_norm": 2.4687234809350107, "learning_rate": 4.9795070137966864e-06, "loss": 0.3574, "step": 8700 }, { "epoch": 0.5027271940203746, "grad_norm": 5.072249220362736, "learning_rate": 4.9737343416267395e-06, "loss": 0.3494, "step": 8710 }, { "epoch": 0.5033043779400306, "grad_norm": 2.491352620261712, "learning_rate": 4.967961669456792e-06, "loss": 0.3567, "step": 8720 }, { "epoch": 0.5038815618596866, "grad_norm": 4.883169609666169, "learning_rate": 4.962188997286845e-06, "loss": 0.3563, "step": 8730 }, { "epoch": 0.5044587457793426, "grad_norm": 3.9469009793644623, "learning_rate": 4.956416325116897e-06, "loss": 0.3669, "step": 8740 }, { "epoch": 0.5050359296989986, "grad_norm": 5.325339903718785, "learning_rate": 4.95064365294695e-06, "loss": 0.3479, "step": 8750 }, { "epoch": 0.5056131136186546, "grad_norm": 2.924190713741744, "learning_rate": 4.944870980777002e-06, "loss": 0.3596, "step": 8760 }, { "epoch": 0.5061902975383106, "grad_norm": 29.94785140992617, "learning_rate": 4.939098308607055e-06, "loss": 0.3482, "step": 8770 }, { "epoch": 0.5067674814579666, "grad_norm": 4.600461351076627, "learning_rate": 4.933325636437107e-06, "loss": 0.3676, "step": 8780 }, { "epoch": 0.5073446653776226, "grad_norm": 3.298079688278539, "learning_rate": 4.92755296426716e-06, "loss": 0.3849, "step": 8790 }, { "epoch": 0.5079218492972786, "grad_norm": 3.3537368346024903, "learning_rate": 4.921780292097212e-06, "loss": 0.354, "step": 8800 }, { "epoch": 0.5084990332169346, "grad_norm": 4.562710088882002, "learning_rate": 4.9160076199272646e-06, "loss": 0.3762, "step": 8810 }, { "epoch": 0.5090762171365906, "grad_norm": 3.600922103480094, "learning_rate": 4.910234947757318e-06, "loss": 0.3709, "step": 8820 }, { "epoch": 0.5096534010562466, "grad_norm": 5.507110655352795, "learning_rate": 4.90446227558737e-06, "loss": 0.35, "step": 8830 }, { "epoch": 0.5102305849759026, "grad_norm": 14.876481920624023, "learning_rate": 4.898689603417423e-06, "loss": 0.3655, "step": 8840 }, { "epoch": 0.5108077688955586, "grad_norm": 6.040943909655418, "learning_rate": 4.892916931247475e-06, "loss": 0.3537, "step": 8850 }, { "epoch": 0.5113849528152146, "grad_norm": 4.784882455474531, "learning_rate": 4.887144259077527e-06, "loss": 0.365, "step": 8860 }, { "epoch": 0.5119621367348706, "grad_norm": 3.5957944832471864, "learning_rate": 4.88137158690758e-06, "loss": 0.3649, "step": 8870 }, { "epoch": 0.5125393206545266, "grad_norm": 7.656707006499249, "learning_rate": 4.875598914737632e-06, "loss": 0.3678, "step": 8880 }, { "epoch": 0.5131165045741826, "grad_norm": 3.6610216347500666, "learning_rate": 4.869826242567685e-06, "loss": 0.3659, "step": 8890 }, { "epoch": 0.5136936884938386, "grad_norm": 4.181649664719206, "learning_rate": 4.8640535703977375e-06, "loss": 0.358, "step": 8900 }, { "epoch": 0.5142708724134946, "grad_norm": 1.8612925513884986, "learning_rate": 4.8582808982277905e-06, "loss": 0.3508, "step": 8910 }, { "epoch": 0.5148480563331506, "grad_norm": 5.0292268546846195, "learning_rate": 4.852508226057843e-06, "loss": 0.3567, "step": 8920 }, { "epoch": 0.5154252402528066, "grad_norm": 5.77083593828813, "learning_rate": 4.846735553887895e-06, "loss": 0.3706, "step": 8930 }, { "epoch": 0.5160024241724626, "grad_norm": 3.1575366683166264, "learning_rate": 4.840962881717948e-06, "loss": 0.3603, "step": 8940 }, { "epoch": 0.5165796080921186, "grad_norm": 3.0092615460602357, "learning_rate": 4.835190209548e-06, "loss": 0.3567, "step": 8950 }, { "epoch": 0.5171567920117746, "grad_norm": 5.424773899652038, "learning_rate": 4.829417537378053e-06, "loss": 0.3569, "step": 8960 }, { "epoch": 0.5177339759314306, "grad_norm": 4.0199024195308075, "learning_rate": 4.823644865208105e-06, "loss": 0.376, "step": 8970 }, { "epoch": 0.5183111598510866, "grad_norm": 8.92239001104145, "learning_rate": 4.8178721930381575e-06, "loss": 0.3572, "step": 8980 }, { "epoch": 0.5188883437707426, "grad_norm": 10.702196683213096, "learning_rate": 4.8120995208682105e-06, "loss": 0.3651, "step": 8990 }, { "epoch": 0.5194655276903986, "grad_norm": 4.470162551927128, "learning_rate": 4.806326848698263e-06, "loss": 0.3654, "step": 9000 }, { "epoch": 0.5200427116100546, "grad_norm": 2.1821708334970737, "learning_rate": 4.800554176528316e-06, "loss": 0.3516, "step": 9010 }, { "epoch": 0.5206198955297106, "grad_norm": 5.347973377285275, "learning_rate": 4.794781504358368e-06, "loss": 0.3623, "step": 9020 }, { "epoch": 0.5211970794493666, "grad_norm": 4.623834142826691, "learning_rate": 4.78900883218842e-06, "loss": 0.3727, "step": 9030 }, { "epoch": 0.5217742633690226, "grad_norm": 5.255808460960779, "learning_rate": 4.783236160018473e-06, "loss": 0.3729, "step": 9040 }, { "epoch": 0.5223514472886785, "grad_norm": 3.1254534476864215, "learning_rate": 4.777463487848525e-06, "loss": 0.3679, "step": 9050 }, { "epoch": 0.5229286312083345, "grad_norm": 9.646051621259671, "learning_rate": 4.771690815678578e-06, "loss": 0.3834, "step": 9060 }, { "epoch": 0.5235058151279905, "grad_norm": 4.490473334084667, "learning_rate": 4.76591814350863e-06, "loss": 0.3656, "step": 9070 }, { "epoch": 0.5240829990476465, "grad_norm": 2.607385547296513, "learning_rate": 4.760145471338683e-06, "loss": 0.3861, "step": 9080 }, { "epoch": 0.5246601829673025, "grad_norm": 3.7754579602738136, "learning_rate": 4.754372799168736e-06, "loss": 0.3506, "step": 9090 }, { "epoch": 0.5252373668869585, "grad_norm": 2.8550942295775896, "learning_rate": 4.748600126998788e-06, "loss": 0.3567, "step": 9100 }, { "epoch": 0.5258145508066145, "grad_norm": 1.7112590538881849, "learning_rate": 4.742827454828841e-06, "loss": 0.3604, "step": 9110 }, { "epoch": 0.5263917347262705, "grad_norm": 2.7066239753400585, "learning_rate": 4.737054782658893e-06, "loss": 0.36, "step": 9120 }, { "epoch": 0.5269689186459265, "grad_norm": 5.758530747558061, "learning_rate": 4.731282110488946e-06, "loss": 0.3634, "step": 9130 }, { "epoch": 0.5275461025655825, "grad_norm": 210.77018196053547, "learning_rate": 4.725509438318998e-06, "loss": 0.3722, "step": 9140 }, { "epoch": 0.5281232864852385, "grad_norm": 2.300368854843036, "learning_rate": 4.71973676614905e-06, "loss": 0.3729, "step": 9150 }, { "epoch": 0.5287004704048945, "grad_norm": 2.210540304841504, "learning_rate": 4.713964093979103e-06, "loss": 0.3796, "step": 9160 }, { "epoch": 0.5292776543245505, "grad_norm": 2.2634685628485145, "learning_rate": 4.7081914218091555e-06, "loss": 0.356, "step": 9170 }, { "epoch": 0.5298548382442065, "grad_norm": 3.1055746156661614, "learning_rate": 4.7024187496392085e-06, "loss": 0.3582, "step": 9180 }, { "epoch": 0.5304320221638625, "grad_norm": 3.654168505440511, "learning_rate": 4.696646077469261e-06, "loss": 0.3607, "step": 9190 }, { "epoch": 0.5310092060835185, "grad_norm": 3.0537549885812347, "learning_rate": 4.690873405299313e-06, "loss": 0.3767, "step": 9200 }, { "epoch": 0.5315863900031745, "grad_norm": 1.9675795559729068, "learning_rate": 4.685100733129366e-06, "loss": 0.3657, "step": 9210 }, { "epoch": 0.5321635739228305, "grad_norm": 2.0115039115011606, "learning_rate": 4.679328060959418e-06, "loss": 0.3601, "step": 9220 }, { "epoch": 0.5327407578424865, "grad_norm": 5.152045183089219, "learning_rate": 4.673555388789471e-06, "loss": 0.3874, "step": 9230 }, { "epoch": 0.5333179417621425, "grad_norm": 3.394195501156254, "learning_rate": 4.667782716619523e-06, "loss": 0.3687, "step": 9240 }, { "epoch": 0.5338951256817985, "grad_norm": 3.733950998673983, "learning_rate": 4.6620100444495755e-06, "loss": 0.3812, "step": 9250 }, { "epoch": 0.5344723096014545, "grad_norm": 2.172335452644072, "learning_rate": 4.6562373722796285e-06, "loss": 0.3719, "step": 9260 }, { "epoch": 0.5350494935211105, "grad_norm": 1.8466321499683245, "learning_rate": 4.650464700109681e-06, "loss": 0.3563, "step": 9270 }, { "epoch": 0.5356266774407665, "grad_norm": 2.411259261337378, "learning_rate": 4.644692027939734e-06, "loss": 0.3652, "step": 9280 }, { "epoch": 0.5362038613604225, "grad_norm": 2.525799052658632, "learning_rate": 4.638919355769787e-06, "loss": 0.357, "step": 9290 }, { "epoch": 0.5367810452800785, "grad_norm": 4.322100164387683, "learning_rate": 4.633146683599839e-06, "loss": 0.344, "step": 9300 }, { "epoch": 0.5373582291997345, "grad_norm": 2.9720413934133267, "learning_rate": 4.627374011429892e-06, "loss": 0.3591, "step": 9310 }, { "epoch": 0.5379354131193905, "grad_norm": 3.236889177938413, "learning_rate": 4.621601339259944e-06, "loss": 0.3698, "step": 9320 }, { "epoch": 0.5385125970390465, "grad_norm": 4.173966589558208, "learning_rate": 4.615828667089996e-06, "loss": 0.3687, "step": 9330 }, { "epoch": 0.5390897809587025, "grad_norm": 3.3050072649704387, "learning_rate": 4.610055994920049e-06, "loss": 0.3559, "step": 9340 }, { "epoch": 0.5396669648783585, "grad_norm": 2.2463012820732904, "learning_rate": 4.6042833227501014e-06, "loss": 0.3585, "step": 9350 }, { "epoch": 0.5402441487980145, "grad_norm": 2.3061353512132357, "learning_rate": 4.5985106505801544e-06, "loss": 0.3666, "step": 9360 }, { "epoch": 0.5408213327176705, "grad_norm": 2.8767755309606393, "learning_rate": 4.592737978410207e-06, "loss": 0.3656, "step": 9370 }, { "epoch": 0.5413985166373265, "grad_norm": 3.5885590608959603, "learning_rate": 4.586965306240259e-06, "loss": 0.3619, "step": 9380 }, { "epoch": 0.5419757005569825, "grad_norm": 3.012510637436092, "learning_rate": 4.581192634070312e-06, "loss": 0.3679, "step": 9390 }, { "epoch": 0.5425528844766385, "grad_norm": 2.074304355176205, "learning_rate": 4.575419961900364e-06, "loss": 0.3558, "step": 9400 }, { "epoch": 0.5431300683962945, "grad_norm": 2.410649696384616, "learning_rate": 4.569647289730417e-06, "loss": 0.3667, "step": 9410 }, { "epoch": 0.5437072523159505, "grad_norm": 6.990964309593162, "learning_rate": 4.563874617560469e-06, "loss": 0.3544, "step": 9420 }, { "epoch": 0.5442844362356065, "grad_norm": 1.6679890672242221, "learning_rate": 4.558101945390521e-06, "loss": 0.3635, "step": 9430 }, { "epoch": 0.5448616201552625, "grad_norm": 3.5900096135866177, "learning_rate": 4.552329273220574e-06, "loss": 0.3548, "step": 9440 }, { "epoch": 0.5454388040749185, "grad_norm": 2.8054899052225655, "learning_rate": 4.5465566010506266e-06, "loss": 0.3573, "step": 9450 }, { "epoch": 0.5460159879945745, "grad_norm": 2.7792687957236315, "learning_rate": 4.5407839288806796e-06, "loss": 0.3677, "step": 9460 }, { "epoch": 0.5465931719142305, "grad_norm": 2.2614901204437636, "learning_rate": 4.535011256710732e-06, "loss": 0.3643, "step": 9470 }, { "epoch": 0.5471703558338865, "grad_norm": 2.062841207589413, "learning_rate": 4.529238584540785e-06, "loss": 0.3592, "step": 9480 }, { "epoch": 0.5477475397535425, "grad_norm": 8.866919074207425, "learning_rate": 4.523465912370837e-06, "loss": 0.3552, "step": 9490 }, { "epoch": 0.5483247236731985, "grad_norm": 2.5645835670129618, "learning_rate": 4.517693240200889e-06, "loss": 0.3498, "step": 9500 }, { "epoch": 0.5489019075928545, "grad_norm": 3.1741970680335716, "learning_rate": 4.511920568030942e-06, "loss": 0.3682, "step": 9510 }, { "epoch": 0.5494790915125105, "grad_norm": 2.0747984580427765, "learning_rate": 4.506147895860994e-06, "loss": 0.3646, "step": 9520 }, { "epoch": 0.5500562754321665, "grad_norm": 2.3586125635742654, "learning_rate": 4.500375223691047e-06, "loss": 0.355, "step": 9530 }, { "epoch": 0.5506334593518225, "grad_norm": 3.8182790115085927, "learning_rate": 4.4946025515210995e-06, "loss": 0.3528, "step": 9540 }, { "epoch": 0.5512106432714785, "grad_norm": 2.623243257421812, "learning_rate": 4.488829879351152e-06, "loss": 0.3551, "step": 9550 }, { "epoch": 0.5517878271911345, "grad_norm": 2.775469371839904, "learning_rate": 4.483057207181205e-06, "loss": 0.3556, "step": 9560 }, { "epoch": 0.5523650111107905, "grad_norm": 2.345529859871698, "learning_rate": 4.477284535011257e-06, "loss": 0.3702, "step": 9570 }, { "epoch": 0.5529421950304465, "grad_norm": 5.265133567547254, "learning_rate": 4.47151186284131e-06, "loss": 0.3617, "step": 9580 }, { "epoch": 0.5535193789501025, "grad_norm": 3.1602517841252897, "learning_rate": 4.465739190671362e-06, "loss": 0.3524, "step": 9590 }, { "epoch": 0.5540965628697585, "grad_norm": 1.8906279954560556, "learning_rate": 4.459966518501414e-06, "loss": 0.3584, "step": 9600 }, { "epoch": 0.5546737467894145, "grad_norm": 3.4743194872868117, "learning_rate": 4.454193846331467e-06, "loss": 0.3531, "step": 9610 }, { "epoch": 0.5552509307090705, "grad_norm": 2.9951619722989578, "learning_rate": 4.4484211741615194e-06, "loss": 0.3525, "step": 9620 }, { "epoch": 0.5558281146287264, "grad_norm": 5.694532155563467, "learning_rate": 4.4426485019915725e-06, "loss": 0.3568, "step": 9630 }, { "epoch": 0.5564052985483824, "grad_norm": 2.0095159465987398, "learning_rate": 4.436875829821625e-06, "loss": 0.3586, "step": 9640 }, { "epoch": 0.5569824824680384, "grad_norm": 2.703106746299796, "learning_rate": 4.431103157651677e-06, "loss": 0.3586, "step": 9650 }, { "epoch": 0.5575596663876944, "grad_norm": 4.366070322479832, "learning_rate": 4.42533048548173e-06, "loss": 0.3628, "step": 9660 }, { "epoch": 0.5581368503073504, "grad_norm": 5.959735845454689, "learning_rate": 4.419557813311782e-06, "loss": 0.3453, "step": 9670 }, { "epoch": 0.5587140342270064, "grad_norm": 3.887262505871754, "learning_rate": 4.413785141141835e-06, "loss": 0.3579, "step": 9680 }, { "epoch": 0.5592912181466624, "grad_norm": 3.5847585820260273, "learning_rate": 4.408012468971887e-06, "loss": 0.353, "step": 9690 }, { "epoch": 0.5598684020663184, "grad_norm": 11.183076093226834, "learning_rate": 4.40223979680194e-06, "loss": 0.354, "step": 9700 }, { "epoch": 0.5604455859859744, "grad_norm": 8.751616032322445, "learning_rate": 4.396467124631992e-06, "loss": 0.358, "step": 9710 }, { "epoch": 0.5610227699056304, "grad_norm": 8.759188394381018, "learning_rate": 4.3906944524620446e-06, "loss": 0.3477, "step": 9720 }, { "epoch": 0.5615999538252864, "grad_norm": 3.185742989121124, "learning_rate": 4.3849217802920976e-06, "loss": 0.3527, "step": 9730 }, { "epoch": 0.5621771377449424, "grad_norm": 2.6034839783878847, "learning_rate": 4.37914910812215e-06, "loss": 0.3614, "step": 9740 }, { "epoch": 0.5627543216645984, "grad_norm": 3.8176039303217943, "learning_rate": 4.373376435952203e-06, "loss": 0.3629, "step": 9750 }, { "epoch": 0.5633315055842544, "grad_norm": 2.892391389713988, "learning_rate": 4.367603763782255e-06, "loss": 0.3543, "step": 9760 }, { "epoch": 0.5639086895039104, "grad_norm": 3.756297203577958, "learning_rate": 4.361831091612307e-06, "loss": 0.3618, "step": 9770 }, { "epoch": 0.5644858734235664, "grad_norm": 5.678261576873807, "learning_rate": 4.35605841944236e-06, "loss": 0.3568, "step": 9780 }, { "epoch": 0.5650630573432224, "grad_norm": 8.441638312518547, "learning_rate": 4.350285747272412e-06, "loss": 0.3729, "step": 9790 }, { "epoch": 0.5656402412628784, "grad_norm": 11.963283236920255, "learning_rate": 4.344513075102465e-06, "loss": 0.3515, "step": 9800 }, { "epoch": 0.5662174251825344, "grad_norm": 5.191809552146641, "learning_rate": 4.338740402932518e-06, "loss": 0.3555, "step": 9810 }, { "epoch": 0.5667946091021904, "grad_norm": 4.402544511071244, "learning_rate": 4.3329677307625705e-06, "loss": 0.3593, "step": 9820 }, { "epoch": 0.5673717930218464, "grad_norm": 3.877667010250195, "learning_rate": 4.327195058592623e-06, "loss": 0.3671, "step": 9830 }, { "epoch": 0.5679489769415024, "grad_norm": 3.6190457236390907, "learning_rate": 4.321422386422676e-06, "loss": 0.3821, "step": 9840 }, { "epoch": 0.5685261608611584, "grad_norm": 6.877296061159591, "learning_rate": 4.315649714252728e-06, "loss": 0.3603, "step": 9850 }, { "epoch": 0.5691033447808144, "grad_norm": 4.590765318427339, "learning_rate": 4.309877042082781e-06, "loss": 0.3713, "step": 9860 }, { "epoch": 0.5696805287004704, "grad_norm": 4.474831184892119, "learning_rate": 4.304104369912833e-06, "loss": 0.3475, "step": 9870 }, { "epoch": 0.5702577126201264, "grad_norm": 6.348172468881283, "learning_rate": 4.298331697742886e-06, "loss": 0.3498, "step": 9880 }, { "epoch": 0.5708348965397824, "grad_norm": 3.2161248922526906, "learning_rate": 4.292559025572938e-06, "loss": 0.3548, "step": 9890 }, { "epoch": 0.5714120804594384, "grad_norm": 3.6712025528595476, "learning_rate": 4.2867863534029905e-06, "loss": 0.3698, "step": 9900 }, { "epoch": 0.5719892643790944, "grad_norm": 6.372020331564792, "learning_rate": 4.2810136812330435e-06, "loss": 0.3774, "step": 9910 }, { "epoch": 0.5725664482987504, "grad_norm": 7.0760072190976055, "learning_rate": 4.275241009063096e-06, "loss": 0.366, "step": 9920 }, { "epoch": 0.5731436322184064, "grad_norm": 4.330191932801956, "learning_rate": 4.269468336893149e-06, "loss": 0.3659, "step": 9930 }, { "epoch": 0.5737208161380624, "grad_norm": 3.7388442828506183, "learning_rate": 4.263695664723201e-06, "loss": 0.3585, "step": 9940 }, { "epoch": 0.5742980000577184, "grad_norm": 3.788666515258982, "learning_rate": 4.257922992553253e-06, "loss": 0.3624, "step": 9950 }, { "epoch": 0.5748751839773744, "grad_norm": 3.882574363808373, "learning_rate": 4.252150320383306e-06, "loss": 0.359, "step": 9960 }, { "epoch": 0.5754523678970304, "grad_norm": 3.3860959596594764, "learning_rate": 4.246377648213358e-06, "loss": 0.3753, "step": 9970 }, { "epoch": 0.5760295518166864, "grad_norm": 3.0843751033026936, "learning_rate": 4.240604976043411e-06, "loss": 0.3623, "step": 9980 }, { "epoch": 0.5766067357363424, "grad_norm": 4.2670578544960165, "learning_rate": 4.234832303873463e-06, "loss": 0.3583, "step": 9990 }, { "epoch": 0.5771839196559984, "grad_norm": 4.252817625823887, "learning_rate": 4.229059631703516e-06, "loss": 0.3576, "step": 10000 }, { "epoch": 0.5777611035756544, "grad_norm": 3.407625305823021, "learning_rate": 4.223286959533569e-06, "loss": 0.3552, "step": 10010 }, { "epoch": 0.5783382874953104, "grad_norm": 3.1679099104052484, "learning_rate": 4.217514287363621e-06, "loss": 0.3738, "step": 10020 }, { "epoch": 0.5789154714149664, "grad_norm": 3.6787102703539443, "learning_rate": 4.211741615193674e-06, "loss": 0.3578, "step": 10030 }, { "epoch": 0.5794926553346224, "grad_norm": 5.851925555703329, "learning_rate": 4.205968943023726e-06, "loss": 0.3419, "step": 10040 }, { "epoch": 0.5800698392542784, "grad_norm": 4.329132073061233, "learning_rate": 4.200196270853779e-06, "loss": 0.3643, "step": 10050 }, { "epoch": 0.5806470231739344, "grad_norm": 5.997643376925449, "learning_rate": 4.194423598683831e-06, "loss": 0.377, "step": 10060 }, { "epoch": 0.5812242070935904, "grad_norm": 2.847934197964713, "learning_rate": 4.188650926513883e-06, "loss": 0.3637, "step": 10070 }, { "epoch": 0.5818013910132463, "grad_norm": 5.914457578502053, "learning_rate": 4.182878254343936e-06, "loss": 0.3674, "step": 10080 }, { "epoch": 0.5823785749329023, "grad_norm": 2.717688952876076, "learning_rate": 4.1771055821739885e-06, "loss": 0.3445, "step": 10090 }, { "epoch": 0.5829557588525583, "grad_norm": 4.506246357458738, "learning_rate": 4.1713329100040415e-06, "loss": 0.3596, "step": 10100 }, { "epoch": 0.5835329427722143, "grad_norm": 4.390097302175211, "learning_rate": 4.165560237834094e-06, "loss": 0.3502, "step": 10110 }, { "epoch": 0.5841101266918703, "grad_norm": 61.470288646964526, "learning_rate": 4.159787565664146e-06, "loss": 0.364, "step": 10120 }, { "epoch": 0.5846873106115263, "grad_norm": 4.249154244474576, "learning_rate": 4.154014893494199e-06, "loss": 0.3494, "step": 10130 }, { "epoch": 0.5852644945311823, "grad_norm": 2.6108964866695956, "learning_rate": 4.148242221324251e-06, "loss": 0.3417, "step": 10140 }, { "epoch": 0.5858416784508383, "grad_norm": 3.655089863468255, "learning_rate": 4.142469549154304e-06, "loss": 0.3576, "step": 10150 }, { "epoch": 0.5864188623704943, "grad_norm": 19.653637032520724, "learning_rate": 4.136696876984356e-06, "loss": 0.3603, "step": 10160 }, { "epoch": 0.5869960462901503, "grad_norm": 4.623567506823282, "learning_rate": 4.1309242048144085e-06, "loss": 0.3486, "step": 10170 }, { "epoch": 0.5875732302098063, "grad_norm": 5.14547645262892, "learning_rate": 4.1251515326444615e-06, "loss": 0.3591, "step": 10180 }, { "epoch": 0.5881504141294623, "grad_norm": 20.59966690800242, "learning_rate": 4.119378860474514e-06, "loss": 0.3473, "step": 10190 }, { "epoch": 0.5887275980491183, "grad_norm": 3.3637726076066796, "learning_rate": 4.113606188304567e-06, "loss": 0.3706, "step": 10200 }, { "epoch": 0.5893047819687743, "grad_norm": 4.061688985881421, "learning_rate": 4.107833516134619e-06, "loss": 0.3694, "step": 10210 }, { "epoch": 0.5898819658884303, "grad_norm": 4.941411464322626, "learning_rate": 4.102060843964671e-06, "loss": 0.3551, "step": 10220 }, { "epoch": 0.5904591498080863, "grad_norm": 4.631050291252514, "learning_rate": 4.096288171794724e-06, "loss": 0.3669, "step": 10230 }, { "epoch": 0.5910363337277423, "grad_norm": 5.833635533863073, "learning_rate": 4.090515499624776e-06, "loss": 0.354, "step": 10240 }, { "epoch": 0.5916135176473983, "grad_norm": 17.060908158433886, "learning_rate": 4.084742827454829e-06, "loss": 0.355, "step": 10250 }, { "epoch": 0.5921907015670543, "grad_norm": 4.438473043902829, "learning_rate": 4.078970155284881e-06, "loss": 0.3559, "step": 10260 }, { "epoch": 0.5927678854867103, "grad_norm": 4.029441903808598, "learning_rate": 4.0731974831149344e-06, "loss": 0.3456, "step": 10270 }, { "epoch": 0.5933450694063663, "grad_norm": 4.894018752470052, "learning_rate": 4.067424810944987e-06, "loss": 0.342, "step": 10280 }, { "epoch": 0.5939222533260223, "grad_norm": 2.403763740670601, "learning_rate": 4.061652138775039e-06, "loss": 0.3653, "step": 10290 }, { "epoch": 0.5944994372456783, "grad_norm": 4.283731997785048, "learning_rate": 4.055879466605092e-06, "loss": 0.3668, "step": 10300 }, { "epoch": 0.5950766211653343, "grad_norm": 3.021670467218982, "learning_rate": 4.050106794435144e-06, "loss": 0.3554, "step": 10310 }, { "epoch": 0.5956538050849903, "grad_norm": 39.77509257463927, "learning_rate": 4.044334122265197e-06, "loss": 0.373, "step": 10320 }, { "epoch": 0.5962309890046463, "grad_norm": 7.734517576459877, "learning_rate": 4.03856145009525e-06, "loss": 0.3545, "step": 10330 }, { "epoch": 0.5968081729243023, "grad_norm": 12.164334264143207, "learning_rate": 4.032788777925302e-06, "loss": 0.3593, "step": 10340 }, { "epoch": 0.5973853568439583, "grad_norm": 6.962481288991228, "learning_rate": 4.027016105755354e-06, "loss": 0.3424, "step": 10350 }, { "epoch": 0.5979625407636143, "grad_norm": 3.265508104782265, "learning_rate": 4.021243433585407e-06, "loss": 0.3401, "step": 10360 }, { "epoch": 0.5985397246832703, "grad_norm": 3.169522843364254, "learning_rate": 4.0154707614154596e-06, "loss": 0.3534, "step": 10370 }, { "epoch": 0.5991169086029263, "grad_norm": 3.2365860904520662, "learning_rate": 4.0096980892455126e-06, "loss": 0.3476, "step": 10380 }, { "epoch": 0.5996940925225823, "grad_norm": 22.34179305900222, "learning_rate": 4.003925417075565e-06, "loss": 0.3615, "step": 10390 }, { "epoch": 0.6002712764422383, "grad_norm": 3.6824711768745235, "learning_rate": 3.998152744905617e-06, "loss": 0.3598, "step": 10400 }, { "epoch": 0.6008484603618943, "grad_norm": 2.244270658998301, "learning_rate": 3.99238007273567e-06, "loss": 0.3602, "step": 10410 }, { "epoch": 0.6014256442815503, "grad_norm": 2.944264669766013, "learning_rate": 3.986607400565722e-06, "loss": 0.3557, "step": 10420 }, { "epoch": 0.6020028282012063, "grad_norm": 3.6813564246612893, "learning_rate": 3.980834728395775e-06, "loss": 0.3426, "step": 10430 }, { "epoch": 0.6025800121208623, "grad_norm": 2.4642472523118193, "learning_rate": 3.975062056225827e-06, "loss": 0.3498, "step": 10440 }, { "epoch": 0.6031571960405183, "grad_norm": 2.612111770933025, "learning_rate": 3.96928938405588e-06, "loss": 0.3738, "step": 10450 }, { "epoch": 0.6037343799601743, "grad_norm": 4.989290119921459, "learning_rate": 3.9635167118859325e-06, "loss": 0.3437, "step": 10460 }, { "epoch": 0.6043115638798303, "grad_norm": 3.191111659552641, "learning_rate": 3.957744039715985e-06, "loss": 0.3513, "step": 10470 }, { "epoch": 0.6048887477994863, "grad_norm": 6.289514020679802, "learning_rate": 3.951971367546038e-06, "loss": 0.3617, "step": 10480 }, { "epoch": 0.6054659317191423, "grad_norm": 4.369159045847553, "learning_rate": 3.94619869537609e-06, "loss": 0.3622, "step": 10490 }, { "epoch": 0.6060431156387983, "grad_norm": 4.272747590300094, "learning_rate": 3.940426023206143e-06, "loss": 0.3679, "step": 10500 }, { "epoch": 0.6066202995584543, "grad_norm": 6.298266009612924, "learning_rate": 3.934653351036195e-06, "loss": 0.3605, "step": 10510 }, { "epoch": 0.6071974834781103, "grad_norm": 2.853912711053667, "learning_rate": 3.928880678866247e-06, "loss": 0.3643, "step": 10520 }, { "epoch": 0.6077746673977663, "grad_norm": 4.905191894605176, "learning_rate": 3.9231080066963e-06, "loss": 0.3566, "step": 10530 }, { "epoch": 0.6083518513174223, "grad_norm": 3.7179571361092307, "learning_rate": 3.9173353345263524e-06, "loss": 0.3534, "step": 10540 }, { "epoch": 0.6089290352370783, "grad_norm": 4.720816418264325, "learning_rate": 3.9115626623564055e-06, "loss": 0.3445, "step": 10550 }, { "epoch": 0.6095062191567343, "grad_norm": 6.0994643547541045, "learning_rate": 3.905789990186458e-06, "loss": 0.3447, "step": 10560 }, { "epoch": 0.6100834030763903, "grad_norm": 7.143461522640564, "learning_rate": 3.90001731801651e-06, "loss": 0.3486, "step": 10570 }, { "epoch": 0.6106605869960463, "grad_norm": 3.5865672738484515, "learning_rate": 3.894244645846563e-06, "loss": 0.3543, "step": 10580 }, { "epoch": 0.6112377709157023, "grad_norm": 3.44671753994167, "learning_rate": 3.888471973676615e-06, "loss": 0.339, "step": 10590 }, { "epoch": 0.6118149548353583, "grad_norm": 4.037111129069171, "learning_rate": 3.882699301506668e-06, "loss": 0.3542, "step": 10600 }, { "epoch": 0.6123921387550143, "grad_norm": 2.5068462700876752, "learning_rate": 3.87692662933672e-06, "loss": 0.3612, "step": 10610 }, { "epoch": 0.6129693226746703, "grad_norm": 2.69916896955261, "learning_rate": 3.871153957166772e-06, "loss": 0.3552, "step": 10620 }, { "epoch": 0.6135465065943263, "grad_norm": 2.12828690128291, "learning_rate": 3.865381284996825e-06, "loss": 0.3464, "step": 10630 }, { "epoch": 0.6141236905139823, "grad_norm": 2.4651478648163754, "learning_rate": 3.8596086128268776e-06, "loss": 0.3714, "step": 10640 }, { "epoch": 0.6147008744336383, "grad_norm": 4.709377859928187, "learning_rate": 3.853835940656931e-06, "loss": 0.3462, "step": 10650 }, { "epoch": 0.6152780583532943, "grad_norm": 14.878118519317356, "learning_rate": 3.848063268486983e-06, "loss": 0.355, "step": 10660 }, { "epoch": 0.6158552422729503, "grad_norm": 7.0013585156765314, "learning_rate": 3.842290596317036e-06, "loss": 0.3548, "step": 10670 }, { "epoch": 0.6164324261926063, "grad_norm": 4.781645383408167, "learning_rate": 3.836517924147088e-06, "loss": 0.3544, "step": 10680 }, { "epoch": 0.6170096101122623, "grad_norm": 2.5404756093298695, "learning_rate": 3.83074525197714e-06, "loss": 0.3534, "step": 10690 }, { "epoch": 0.6175867940319183, "grad_norm": 8.95780042415011, "learning_rate": 3.824972579807193e-06, "loss": 0.3636, "step": 10700 }, { "epoch": 0.6181639779515743, "grad_norm": 4.989641662422552, "learning_rate": 3.819199907637245e-06, "loss": 0.3673, "step": 10710 }, { "epoch": 0.6187411618712303, "grad_norm": 3.0860360298250096, "learning_rate": 3.813427235467298e-06, "loss": 0.3454, "step": 10720 }, { "epoch": 0.6193183457908863, "grad_norm": 5.75583484430528, "learning_rate": 3.8076545632973505e-06, "loss": 0.3598, "step": 10730 }, { "epoch": 0.6198955297105423, "grad_norm": 2.949832246732552, "learning_rate": 3.801881891127403e-06, "loss": 0.3553, "step": 10740 }, { "epoch": 0.6204727136301983, "grad_norm": 5.670166660495844, "learning_rate": 3.7961092189574557e-06, "loss": 0.3626, "step": 10750 }, { "epoch": 0.6210498975498543, "grad_norm": 3.972108943307402, "learning_rate": 3.790336546787508e-06, "loss": 0.3473, "step": 10760 }, { "epoch": 0.6216270814695103, "grad_norm": 6.588272622863319, "learning_rate": 3.7845638746175605e-06, "loss": 0.3504, "step": 10770 }, { "epoch": 0.6222042653891663, "grad_norm": 10.062313562042537, "learning_rate": 3.778791202447613e-06, "loss": 0.348, "step": 10780 }, { "epoch": 0.6227814493088223, "grad_norm": 2.393822027910724, "learning_rate": 3.7730185302776657e-06, "loss": 0.332, "step": 10790 }, { "epoch": 0.6233586332284783, "grad_norm": 3.726451963447983, "learning_rate": 3.7672458581077183e-06, "loss": 0.3544, "step": 10800 }, { "epoch": 0.6239358171481343, "grad_norm": 3.0713208436951405, "learning_rate": 3.761473185937771e-06, "loss": 0.3569, "step": 10810 }, { "epoch": 0.6245130010677903, "grad_norm": 8.506793083475245, "learning_rate": 3.755700513767823e-06, "loss": 0.3498, "step": 10820 }, { "epoch": 0.6250901849874463, "grad_norm": 5.0436471202946205, "learning_rate": 3.7499278415978756e-06, "loss": 0.3539, "step": 10830 }, { "epoch": 0.6256673689071023, "grad_norm": 3.0241635545445957, "learning_rate": 3.7441551694279282e-06, "loss": 0.3623, "step": 10840 }, { "epoch": 0.6262445528267583, "grad_norm": 2.4939286036772703, "learning_rate": 3.7383824972579812e-06, "loss": 0.3578, "step": 10850 }, { "epoch": 0.6268217367464143, "grad_norm": 3.3678230813106373, "learning_rate": 3.732609825088034e-06, "loss": 0.3568, "step": 10860 }, { "epoch": 0.6273989206660703, "grad_norm": 3.1630251340425795, "learning_rate": 3.7268371529180864e-06, "loss": 0.3488, "step": 10870 }, { "epoch": 0.6279761045857263, "grad_norm": 4.715668356612278, "learning_rate": 3.721064480748139e-06, "loss": 0.3487, "step": 10880 }, { "epoch": 0.6285532885053823, "grad_norm": 3.430089287433023, "learning_rate": 3.7152918085781912e-06, "loss": 0.3676, "step": 10890 }, { "epoch": 0.6291304724250383, "grad_norm": 2.9339747157831546, "learning_rate": 3.709519136408244e-06, "loss": 0.3507, "step": 10900 }, { "epoch": 0.6297076563446943, "grad_norm": 4.2204605393920485, "learning_rate": 3.7037464642382964e-06, "loss": 0.3446, "step": 10910 }, { "epoch": 0.6302848402643503, "grad_norm": 2.5323029057405764, "learning_rate": 3.697973792068349e-06, "loss": 0.3507, "step": 10920 }, { "epoch": 0.6308620241840063, "grad_norm": 1.959685524861653, "learning_rate": 3.6922011198984016e-06, "loss": 0.3551, "step": 10930 }, { "epoch": 0.6314392081036623, "grad_norm": 2.2201928530131085, "learning_rate": 3.6864284477284538e-06, "loss": 0.3476, "step": 10940 }, { "epoch": 0.6320163920233183, "grad_norm": 2.5294366254069645, "learning_rate": 3.6806557755585064e-06, "loss": 0.3502, "step": 10950 }, { "epoch": 0.6325935759429743, "grad_norm": 2.5929823326561103, "learning_rate": 3.674883103388559e-06, "loss": 0.3477, "step": 10960 }, { "epoch": 0.6331707598626303, "grad_norm": 3.0643397308226903, "learning_rate": 3.6691104312186116e-06, "loss": 0.3511, "step": 10970 }, { "epoch": 0.6337479437822863, "grad_norm": 3.725143554468828, "learning_rate": 3.663337759048664e-06, "loss": 0.3597, "step": 10980 }, { "epoch": 0.6343251277019423, "grad_norm": 2.332988363561149, "learning_rate": 3.6575650868787168e-06, "loss": 0.3547, "step": 10990 }, { "epoch": 0.6349023116215983, "grad_norm": 4.338506151135665, "learning_rate": 3.651792414708769e-06, "loss": 0.3621, "step": 11000 }, { "epoch": 0.6354794955412543, "grad_norm": 5.853920773449472, "learning_rate": 3.6460197425388215e-06, "loss": 0.3467, "step": 11010 }, { "epoch": 0.6360566794609103, "grad_norm": 2.9801395957721217, "learning_rate": 3.640247070368874e-06, "loss": 0.3533, "step": 11020 }, { "epoch": 0.6366338633805663, "grad_norm": 5.428993197624115, "learning_rate": 3.6344743981989267e-06, "loss": 0.3477, "step": 11030 }, { "epoch": 0.6372110473002223, "grad_norm": 2.1575911965605914, "learning_rate": 3.6287017260289793e-06, "loss": 0.3463, "step": 11040 }, { "epoch": 0.6377882312198783, "grad_norm": 3.3210877709918982, "learning_rate": 3.622929053859032e-06, "loss": 0.3546, "step": 11050 }, { "epoch": 0.6383654151395343, "grad_norm": 2.686843207231148, "learning_rate": 3.617156381689084e-06, "loss": 0.3518, "step": 11060 }, { "epoch": 0.6389425990591903, "grad_norm": 5.280345153851947, "learning_rate": 3.6113837095191367e-06, "loss": 0.3579, "step": 11070 }, { "epoch": 0.6395197829788463, "grad_norm": 5.403871542937742, "learning_rate": 3.6056110373491893e-06, "loss": 0.3489, "step": 11080 }, { "epoch": 0.6400969668985023, "grad_norm": 2.9735701887326833, "learning_rate": 3.599838365179242e-06, "loss": 0.3502, "step": 11090 }, { "epoch": 0.6406741508181583, "grad_norm": 2.6891252705595368, "learning_rate": 3.5940656930092945e-06, "loss": 0.3612, "step": 11100 }, { "epoch": 0.6412513347378141, "grad_norm": 4.660072834904341, "learning_rate": 3.5882930208393467e-06, "loss": 0.3408, "step": 11110 }, { "epoch": 0.6418285186574701, "grad_norm": 8.181225664129359, "learning_rate": 3.5825203486693993e-06, "loss": 0.3466, "step": 11120 }, { "epoch": 0.6424057025771261, "grad_norm": 3.8250794191372943, "learning_rate": 3.576747676499452e-06, "loss": 0.3428, "step": 11130 }, { "epoch": 0.6429828864967821, "grad_norm": 2.5770691997974975, "learning_rate": 3.5709750043295044e-06, "loss": 0.3617, "step": 11140 }, { "epoch": 0.6435600704164381, "grad_norm": 3.518147076569533, "learning_rate": 3.565202332159557e-06, "loss": 0.3598, "step": 11150 }, { "epoch": 0.6441372543360941, "grad_norm": 2.3978173360258332, "learning_rate": 3.5594296599896096e-06, "loss": 0.3486, "step": 11160 }, { "epoch": 0.6447144382557501, "grad_norm": 4.963521259349147, "learning_rate": 3.553656987819662e-06, "loss": 0.3465, "step": 11170 }, { "epoch": 0.6452916221754061, "grad_norm": 3.7768523495827, "learning_rate": 3.5478843156497144e-06, "loss": 0.3507, "step": 11180 }, { "epoch": 0.6458688060950621, "grad_norm": 1.9990854220435814, "learning_rate": 3.542111643479767e-06, "loss": 0.3485, "step": 11190 }, { "epoch": 0.6464459900147181, "grad_norm": 2.434785484655442, "learning_rate": 3.5363389713098196e-06, "loss": 0.3414, "step": 11200 }, { "epoch": 0.6470231739343741, "grad_norm": 3.764273559187499, "learning_rate": 3.530566299139872e-06, "loss": 0.3475, "step": 11210 }, { "epoch": 0.6476003578540301, "grad_norm": 3.063611287505477, "learning_rate": 3.5247936269699244e-06, "loss": 0.3487, "step": 11220 }, { "epoch": 0.6481775417736861, "grad_norm": 2.7961532657594357, "learning_rate": 3.519020954799977e-06, "loss": 0.3591, "step": 11230 }, { "epoch": 0.6487547256933421, "grad_norm": 5.675631273424128, "learning_rate": 3.5132482826300296e-06, "loss": 0.3387, "step": 11240 }, { "epoch": 0.6493319096129981, "grad_norm": 4.038281786465871, "learning_rate": 3.507475610460082e-06, "loss": 0.3544, "step": 11250 }, { "epoch": 0.6499090935326541, "grad_norm": 19.461560362822837, "learning_rate": 3.5017029382901348e-06, "loss": 0.3612, "step": 11260 }, { "epoch": 0.6504862774523101, "grad_norm": 2.9170938007747838, "learning_rate": 3.4959302661201874e-06, "loss": 0.3633, "step": 11270 }, { "epoch": 0.6510634613719661, "grad_norm": 2.100392402638713, "learning_rate": 3.4901575939502395e-06, "loss": 0.3628, "step": 11280 }, { "epoch": 0.6516406452916221, "grad_norm": 7.466900360838518, "learning_rate": 3.484384921780292e-06, "loss": 0.3544, "step": 11290 }, { "epoch": 0.6522178292112781, "grad_norm": 2.3522582138412984, "learning_rate": 3.4786122496103447e-06, "loss": 0.3768, "step": 11300 }, { "epoch": 0.6527950131309341, "grad_norm": 2.0677131586462556, "learning_rate": 3.4728395774403973e-06, "loss": 0.3514, "step": 11310 }, { "epoch": 0.6533721970505901, "grad_norm": 3.1804108497752943, "learning_rate": 3.46706690527045e-06, "loss": 0.3506, "step": 11320 }, { "epoch": 0.6539493809702461, "grad_norm": 5.396390003664786, "learning_rate": 3.461294233100502e-06, "loss": 0.3539, "step": 11330 }, { "epoch": 0.6545265648899021, "grad_norm": 5.1304776342645235, "learning_rate": 3.4555215609305547e-06, "loss": 0.3578, "step": 11340 }, { "epoch": 0.6551037488095581, "grad_norm": 2.625555244563686, "learning_rate": 3.4497488887606073e-06, "loss": 0.3344, "step": 11350 }, { "epoch": 0.6556809327292141, "grad_norm": 2.765166389820208, "learning_rate": 3.44397621659066e-06, "loss": 0.3467, "step": 11360 }, { "epoch": 0.6562581166488701, "grad_norm": 3.1348291760774556, "learning_rate": 3.4382035444207125e-06, "loss": 0.348, "step": 11370 }, { "epoch": 0.6568353005685261, "grad_norm": 2.6149817919704486, "learning_rate": 3.4324308722507655e-06, "loss": 0.3581, "step": 11380 }, { "epoch": 0.6574124844881821, "grad_norm": 2.4071247325320084, "learning_rate": 3.426658200080818e-06, "loss": 0.3425, "step": 11390 }, { "epoch": 0.6579896684078381, "grad_norm": 6.966272376463285, "learning_rate": 3.4208855279108703e-06, "loss": 0.3648, "step": 11400 }, { "epoch": 0.6585668523274941, "grad_norm": 2.410341785080001, "learning_rate": 3.415112855740923e-06, "loss": 0.3518, "step": 11410 }, { "epoch": 0.6591440362471501, "grad_norm": 4.052333811947672, "learning_rate": 3.4093401835709755e-06, "loss": 0.3538, "step": 11420 }, { "epoch": 0.6597212201668061, "grad_norm": 5.112675962153542, "learning_rate": 3.403567511401028e-06, "loss": 0.3462, "step": 11430 }, { "epoch": 0.6602984040864621, "grad_norm": 2.0711228360250873, "learning_rate": 3.3977948392310807e-06, "loss": 0.3523, "step": 11440 }, { "epoch": 0.6608755880061181, "grad_norm": 3.0723705312379677, "learning_rate": 3.3920221670611333e-06, "loss": 0.3538, "step": 11450 }, { "epoch": 0.6614527719257741, "grad_norm": 7.22672591716136, "learning_rate": 3.3862494948911854e-06, "loss": 0.3508, "step": 11460 }, { "epoch": 0.6620299558454301, "grad_norm": 3.4674616674239447, "learning_rate": 3.380476822721238e-06, "loss": 0.3488, "step": 11470 }, { "epoch": 0.6626071397650861, "grad_norm": 3.0526763692239602, "learning_rate": 3.3747041505512906e-06, "loss": 0.3413, "step": 11480 }, { "epoch": 0.6631843236847421, "grad_norm": 8.92552529404141, "learning_rate": 3.3689314783813432e-06, "loss": 0.3559, "step": 11490 }, { "epoch": 0.6637615076043981, "grad_norm": 7.126184068548845, "learning_rate": 3.363158806211396e-06, "loss": 0.3431, "step": 11500 }, { "epoch": 0.6643386915240541, "grad_norm": 3.6795312021204993, "learning_rate": 3.357386134041448e-06, "loss": 0.3644, "step": 11510 }, { "epoch": 0.6649158754437101, "grad_norm": 3.1394487426454765, "learning_rate": 3.3516134618715006e-06, "loss": 0.355, "step": 11520 }, { "epoch": 0.6654930593633661, "grad_norm": 3.115800444710574, "learning_rate": 3.345840789701553e-06, "loss": 0.3595, "step": 11530 }, { "epoch": 0.6660702432830221, "grad_norm": 3.04464377321414, "learning_rate": 3.3400681175316058e-06, "loss": 0.3474, "step": 11540 }, { "epoch": 0.6666474272026781, "grad_norm": 11.414003410738056, "learning_rate": 3.3342954453616584e-06, "loss": 0.344, "step": 11550 }, { "epoch": 0.6672246111223341, "grad_norm": 7.794881460371124, "learning_rate": 3.328522773191711e-06, "loss": 0.3489, "step": 11560 }, { "epoch": 0.6678017950419901, "grad_norm": 3.606879802040075, "learning_rate": 3.322750101021763e-06, "loss": 0.3542, "step": 11570 }, { "epoch": 0.6683789789616461, "grad_norm": 3.271233323948874, "learning_rate": 3.3169774288518158e-06, "loss": 0.3572, "step": 11580 }, { "epoch": 0.6689561628813021, "grad_norm": 5.312528784803595, "learning_rate": 3.3112047566818683e-06, "loss": 0.344, "step": 11590 }, { "epoch": 0.6695333468009581, "grad_norm": 4.414037045732359, "learning_rate": 3.305432084511921e-06, "loss": 0.3757, "step": 11600 }, { "epoch": 0.6701105307206141, "grad_norm": 4.420990727422642, "learning_rate": 3.2996594123419735e-06, "loss": 0.3361, "step": 11610 }, { "epoch": 0.6706877146402701, "grad_norm": 28.79897728988663, "learning_rate": 3.2938867401720257e-06, "loss": 0.3549, "step": 11620 }, { "epoch": 0.6712648985599261, "grad_norm": 5.938341510395738, "learning_rate": 3.2881140680020783e-06, "loss": 0.3528, "step": 11630 }, { "epoch": 0.6718420824795821, "grad_norm": 7.625014010774144, "learning_rate": 3.282341395832131e-06, "loss": 0.3569, "step": 11640 }, { "epoch": 0.6724192663992381, "grad_norm": 2.627772129382934, "learning_rate": 3.2765687236621835e-06, "loss": 0.341, "step": 11650 }, { "epoch": 0.6729964503188941, "grad_norm": 6.6462109495436765, "learning_rate": 3.270796051492236e-06, "loss": 0.3432, "step": 11660 }, { "epoch": 0.6735736342385501, "grad_norm": 4.140894146799749, "learning_rate": 3.2650233793222887e-06, "loss": 0.35, "step": 11670 }, { "epoch": 0.6741508181582061, "grad_norm": 7.023320528238819, "learning_rate": 3.259250707152341e-06, "loss": 0.3483, "step": 11680 }, { "epoch": 0.6747280020778621, "grad_norm": 3.56371765942958, "learning_rate": 3.2534780349823935e-06, "loss": 0.3456, "step": 11690 }, { "epoch": 0.6753051859975181, "grad_norm": 4.693701665628699, "learning_rate": 3.247705362812446e-06, "loss": 0.3556, "step": 11700 }, { "epoch": 0.6758823699171741, "grad_norm": 4.8769232133317955, "learning_rate": 3.2419326906424987e-06, "loss": 0.3406, "step": 11710 }, { "epoch": 0.6764595538368301, "grad_norm": 14.213756604351863, "learning_rate": 3.2361600184725513e-06, "loss": 0.3422, "step": 11720 }, { "epoch": 0.6770367377564861, "grad_norm": 4.009649954155962, "learning_rate": 3.2303873463026034e-06, "loss": 0.3449, "step": 11730 }, { "epoch": 0.6776139216761421, "grad_norm": 3.538273145119479, "learning_rate": 3.224614674132656e-06, "loss": 0.3457, "step": 11740 }, { "epoch": 0.6781911055957981, "grad_norm": 3.0329790960952026, "learning_rate": 3.2188420019627086e-06, "loss": 0.3479, "step": 11750 }, { "epoch": 0.6787682895154541, "grad_norm": 8.21491132526687, "learning_rate": 3.2130693297927612e-06, "loss": 0.3577, "step": 11760 }, { "epoch": 0.6793454734351101, "grad_norm": 4.981836103874383, "learning_rate": 3.207296657622814e-06, "loss": 0.355, "step": 11770 }, { "epoch": 0.6799226573547661, "grad_norm": 5.1547852515451975, "learning_rate": 3.2015239854528664e-06, "loss": 0.3392, "step": 11780 }, { "epoch": 0.6804998412744221, "grad_norm": 9.811331834930291, "learning_rate": 3.1957513132829186e-06, "loss": 0.3545, "step": 11790 }, { "epoch": 0.6810770251940781, "grad_norm": 4.083480395202693, "learning_rate": 3.189978641112971e-06, "loss": 0.3575, "step": 11800 }, { "epoch": 0.6816542091137341, "grad_norm": 4.764988431769556, "learning_rate": 3.184205968943024e-06, "loss": 0.3372, "step": 11810 }, { "epoch": 0.6822313930333901, "grad_norm": 6.794798971465098, "learning_rate": 3.1784332967730764e-06, "loss": 0.3539, "step": 11820 }, { "epoch": 0.6828085769530461, "grad_norm": 7.111672589883507, "learning_rate": 3.172660624603129e-06, "loss": 0.3544, "step": 11830 }, { "epoch": 0.6833857608727021, "grad_norm": 24.124344459801147, "learning_rate": 3.166887952433181e-06, "loss": 0.3602, "step": 11840 }, { "epoch": 0.6839629447923581, "grad_norm": 12.424128663469016, "learning_rate": 3.1611152802632338e-06, "loss": 0.3441, "step": 11850 }, { "epoch": 0.6845401287120141, "grad_norm": 3.903963843774075, "learning_rate": 3.1553426080932864e-06, "loss": 0.3572, "step": 11860 }, { "epoch": 0.6851173126316701, "grad_norm": 7.5954443349430525, "learning_rate": 3.149569935923339e-06, "loss": 0.3459, "step": 11870 }, { "epoch": 0.6856944965513261, "grad_norm": 5.147769651660261, "learning_rate": 3.1437972637533915e-06, "loss": 0.3443, "step": 11880 }, { "epoch": 0.6862716804709821, "grad_norm": 8.531813464808828, "learning_rate": 3.138024591583444e-06, "loss": 0.3427, "step": 11890 }, { "epoch": 0.6868488643906381, "grad_norm": 9.203737344622347, "learning_rate": 3.1322519194134963e-06, "loss": 0.3466, "step": 11900 }, { "epoch": 0.6874260483102941, "grad_norm": 4.1913039016792055, "learning_rate": 3.1264792472435498e-06, "loss": 0.3667, "step": 11910 }, { "epoch": 0.6880032322299501, "grad_norm": 4.984073528450747, "learning_rate": 3.120706575073602e-06, "loss": 0.3417, "step": 11920 }, { "epoch": 0.6885804161496061, "grad_norm": 4.3895825229927725, "learning_rate": 3.1149339029036545e-06, "loss": 0.3341, "step": 11930 }, { "epoch": 0.6891576000692621, "grad_norm": 5.323675472371107, "learning_rate": 3.109161230733707e-06, "loss": 0.3341, "step": 11940 }, { "epoch": 0.6897347839889181, "grad_norm": 4.263152619263457, "learning_rate": 3.1033885585637597e-06, "loss": 0.3463, "step": 11950 }, { "epoch": 0.6903119679085741, "grad_norm": 6.116151443830828, "learning_rate": 3.0976158863938123e-06, "loss": 0.3603, "step": 11960 }, { "epoch": 0.6908891518282301, "grad_norm": 10.96676225065857, "learning_rate": 3.0918432142238645e-06, "loss": 0.3317, "step": 11970 }, { "epoch": 0.6914663357478861, "grad_norm": 5.204338558399774, "learning_rate": 3.086070542053917e-06, "loss": 0.3432, "step": 11980 }, { "epoch": 0.6920435196675421, "grad_norm": 3.481643749818502, "learning_rate": 3.0802978698839697e-06, "loss": 0.3343, "step": 11990 }, { "epoch": 0.6926207035871981, "grad_norm": 5.422983882620972, "learning_rate": 3.0745251977140223e-06, "loss": 0.3444, "step": 12000 }, { "epoch": 0.6931978875068541, "grad_norm": 3.293726670681602, "learning_rate": 3.068752525544075e-06, "loss": 0.3536, "step": 12010 }, { "epoch": 0.6937750714265101, "grad_norm": 11.162281611641948, "learning_rate": 3.0629798533741275e-06, "loss": 0.3461, "step": 12020 }, { "epoch": 0.6943522553461661, "grad_norm": 4.538803769431588, "learning_rate": 3.0572071812041797e-06, "loss": 0.3404, "step": 12030 }, { "epoch": 0.6949294392658221, "grad_norm": 7.297266576912264, "learning_rate": 3.0514345090342323e-06, "loss": 0.3399, "step": 12040 }, { "epoch": 0.6955066231854781, "grad_norm": 4.498082885030529, "learning_rate": 3.045661836864285e-06, "loss": 0.3572, "step": 12050 }, { "epoch": 0.6960838071051341, "grad_norm": 6.71445000313715, "learning_rate": 3.0398891646943374e-06, "loss": 0.3456, "step": 12060 }, { "epoch": 0.6966609910247901, "grad_norm": 4.130838744263147, "learning_rate": 3.03411649252439e-06, "loss": 0.3382, "step": 12070 }, { "epoch": 0.6972381749444461, "grad_norm": 2.959998512168581, "learning_rate": 3.0283438203544422e-06, "loss": 0.3441, "step": 12080 }, { "epoch": 0.6978153588641021, "grad_norm": 8.68519096842326, "learning_rate": 3.022571148184495e-06, "loss": 0.3536, "step": 12090 }, { "epoch": 0.6983925427837581, "grad_norm": 6.068123748807202, "learning_rate": 3.0167984760145474e-06, "loss": 0.3336, "step": 12100 }, { "epoch": 0.6989697267034141, "grad_norm": 13.720643945389472, "learning_rate": 3.0110258038446e-06, "loss": 0.3453, "step": 12110 }, { "epoch": 0.6995469106230701, "grad_norm": 5.2345993949285115, "learning_rate": 3.0052531316746526e-06, "loss": 0.3514, "step": 12120 }, { "epoch": 0.7001240945427261, "grad_norm": 5.995140869193482, "learning_rate": 2.999480459504705e-06, "loss": 0.3465, "step": 12130 }, { "epoch": 0.7007012784623821, "grad_norm": 5.534508344244959, "learning_rate": 2.9937077873347574e-06, "loss": 0.3436, "step": 12140 }, { "epoch": 0.7012784623820381, "grad_norm": 3.837652212059965, "learning_rate": 2.98793511516481e-06, "loss": 0.347, "step": 12150 }, { "epoch": 0.7018556463016941, "grad_norm": 32.35178311186503, "learning_rate": 2.9821624429948626e-06, "loss": 0.3543, "step": 12160 }, { "epoch": 0.7024328302213501, "grad_norm": 4.1065566840261125, "learning_rate": 2.976389770824915e-06, "loss": 0.3421, "step": 12170 }, { "epoch": 0.7030100141410061, "grad_norm": 6.144592670774153, "learning_rate": 2.9706170986549678e-06, "loss": 0.3466, "step": 12180 }, { "epoch": 0.7035871980606621, "grad_norm": 5.684740566371751, "learning_rate": 2.96484442648502e-06, "loss": 0.349, "step": 12190 }, { "epoch": 0.704164381980318, "grad_norm": 5.234589933221641, "learning_rate": 2.9590717543150725e-06, "loss": 0.3221, "step": 12200 }, { "epoch": 0.704741565899974, "grad_norm": 5.9090591405034205, "learning_rate": 2.953299082145125e-06, "loss": 0.3461, "step": 12210 }, { "epoch": 0.70531874981963, "grad_norm": 5.161621785333446, "learning_rate": 2.9475264099751777e-06, "loss": 0.3343, "step": 12220 }, { "epoch": 0.705895933739286, "grad_norm": 7.454875045097898, "learning_rate": 2.9417537378052303e-06, "loss": 0.339, "step": 12230 }, { "epoch": 0.706473117658942, "grad_norm": 3.3533355785311936, "learning_rate": 2.935981065635283e-06, "loss": 0.3226, "step": 12240 }, { "epoch": 0.707050301578598, "grad_norm": 3.3526244501016507, "learning_rate": 2.930208393465335e-06, "loss": 0.3525, "step": 12250 }, { "epoch": 0.707627485498254, "grad_norm": 46.63609588889749, "learning_rate": 2.9244357212953877e-06, "loss": 0.3388, "step": 12260 }, { "epoch": 0.70820466941791, "grad_norm": 6.343222491694745, "learning_rate": 2.9186630491254403e-06, "loss": 0.3458, "step": 12270 }, { "epoch": 0.708781853337566, "grad_norm": 7.406012410848603, "learning_rate": 2.912890376955493e-06, "loss": 0.3523, "step": 12280 }, { "epoch": 0.709359037257222, "grad_norm": 4.391956311756113, "learning_rate": 2.9071177047855455e-06, "loss": 0.3462, "step": 12290 }, { "epoch": 0.709936221176878, "grad_norm": 7.365023031476813, "learning_rate": 2.9013450326155977e-06, "loss": 0.3507, "step": 12300 }, { "epoch": 0.710513405096534, "grad_norm": 8.355149496371373, "learning_rate": 2.8955723604456503e-06, "loss": 0.3516, "step": 12310 }, { "epoch": 0.71109058901619, "grad_norm": 4.12597154889129, "learning_rate": 2.889799688275703e-06, "loss": 0.3387, "step": 12320 }, { "epoch": 0.711667772935846, "grad_norm": 22.783055812157006, "learning_rate": 2.8840270161057555e-06, "loss": 0.3505, "step": 12330 }, { "epoch": 0.712244956855502, "grad_norm": 6.598877289364409, "learning_rate": 2.878254343935808e-06, "loss": 0.3511, "step": 12340 }, { "epoch": 0.712822140775158, "grad_norm": 6.658331270547365, "learning_rate": 2.8724816717658606e-06, "loss": 0.3472, "step": 12350 }, { "epoch": 0.713399324694814, "grad_norm": 4.249593904236529, "learning_rate": 2.866708999595913e-06, "loss": 0.3462, "step": 12360 }, { "epoch": 0.71397650861447, "grad_norm": 4.820541412534286, "learning_rate": 2.8609363274259654e-06, "loss": 0.3327, "step": 12370 }, { "epoch": 0.714553692534126, "grad_norm": 29.19756399575411, "learning_rate": 2.855163655256018e-06, "loss": 0.3423, "step": 12380 }, { "epoch": 0.715130876453782, "grad_norm": 2.5170693039148695, "learning_rate": 2.8493909830860706e-06, "loss": 0.3507, "step": 12390 }, { "epoch": 0.715708060373438, "grad_norm": 4.848573397975011, "learning_rate": 2.843618310916123e-06, "loss": 0.3614, "step": 12400 }, { "epoch": 0.716285244293094, "grad_norm": 8.083798553592858, "learning_rate": 2.8378456387461754e-06, "loss": 0.3444, "step": 12410 }, { "epoch": 0.71686242821275, "grad_norm": 6.013516986737268, "learning_rate": 2.832072966576228e-06, "loss": 0.3411, "step": 12420 }, { "epoch": 0.717439612132406, "grad_norm": 58.40825846546123, "learning_rate": 2.826300294406281e-06, "loss": 0.3639, "step": 12430 }, { "epoch": 0.718016796052062, "grad_norm": 3.486306193338465, "learning_rate": 2.8205276222363336e-06, "loss": 0.3508, "step": 12440 }, { "epoch": 0.718593979971718, "grad_norm": 11.117203715725822, "learning_rate": 2.814754950066386e-06, "loss": 0.3441, "step": 12450 }, { "epoch": 0.719171163891374, "grad_norm": 6.959568782940119, "learning_rate": 2.808982277896439e-06, "loss": 0.355, "step": 12460 }, { "epoch": 0.71974834781103, "grad_norm": 4.77575740578489, "learning_rate": 2.8032096057264914e-06, "loss": 0.3634, "step": 12470 }, { "epoch": 0.720325531730686, "grad_norm": 12.450518772632652, "learning_rate": 2.797436933556544e-06, "loss": 0.3454, "step": 12480 }, { "epoch": 0.720902715650342, "grad_norm": 4.606989122142115, "learning_rate": 2.791664261386596e-06, "loss": 0.338, "step": 12490 }, { "epoch": 0.721479899569998, "grad_norm": 4.456459932783003, "learning_rate": 2.7858915892166488e-06, "loss": 0.3474, "step": 12500 }, { "epoch": 0.722057083489654, "grad_norm": 4.348978440263518, "learning_rate": 2.7801189170467014e-06, "loss": 0.3488, "step": 12510 }, { "epoch": 0.72263426740931, "grad_norm": 3.220553815408838, "learning_rate": 2.774346244876754e-06, "loss": 0.3372, "step": 12520 }, { "epoch": 0.723211451328966, "grad_norm": 6.662149536726665, "learning_rate": 2.7685735727068065e-06, "loss": 0.3487, "step": 12530 }, { "epoch": 0.723788635248622, "grad_norm": 4.489888078931735, "learning_rate": 2.7628009005368587e-06, "loss": 0.3589, "step": 12540 }, { "epoch": 0.724365819168278, "grad_norm": 3.911221624507832, "learning_rate": 2.7570282283669113e-06, "loss": 0.3364, "step": 12550 }, { "epoch": 0.724943003087934, "grad_norm": 4.397535770990173, "learning_rate": 2.751255556196964e-06, "loss": 0.3468, "step": 12560 }, { "epoch": 0.72552018700759, "grad_norm": 4.138162758687126, "learning_rate": 2.7454828840270165e-06, "loss": 0.351, "step": 12570 }, { "epoch": 0.726097370927246, "grad_norm": 3.6393806674007396, "learning_rate": 2.739710211857069e-06, "loss": 0.3439, "step": 12580 }, { "epoch": 0.7266745548469019, "grad_norm": 10.959014317841367, "learning_rate": 2.7339375396871217e-06, "loss": 0.3466, "step": 12590 }, { "epoch": 0.7272517387665579, "grad_norm": 8.5956598139555, "learning_rate": 2.728164867517174e-06, "loss": 0.3474, "step": 12600 }, { "epoch": 0.7278289226862139, "grad_norm": 8.199922135307672, "learning_rate": 2.7223921953472265e-06, "loss": 0.3334, "step": 12610 }, { "epoch": 0.7284061066058699, "grad_norm": 3.4555204304735563, "learning_rate": 2.716619523177279e-06, "loss": 0.3437, "step": 12620 }, { "epoch": 0.7289832905255259, "grad_norm": 4.838169983017387, "learning_rate": 2.7108468510073317e-06, "loss": 0.3444, "step": 12630 }, { "epoch": 0.7295604744451819, "grad_norm": 3.294894158340646, "learning_rate": 2.7050741788373843e-06, "loss": 0.345, "step": 12640 }, { "epoch": 0.7301376583648379, "grad_norm": 3.886142779724859, "learning_rate": 2.6993015066674364e-06, "loss": 0.34, "step": 12650 }, { "epoch": 0.7307148422844939, "grad_norm": 3.1827555908179814, "learning_rate": 2.693528834497489e-06, "loss": 0.3383, "step": 12660 }, { "epoch": 0.7312920262041499, "grad_norm": 20.97526772421675, "learning_rate": 2.6877561623275416e-06, "loss": 0.3416, "step": 12670 }, { "epoch": 0.7318692101238059, "grad_norm": 3.474528368008189, "learning_rate": 2.6819834901575942e-06, "loss": 0.3359, "step": 12680 }, { "epoch": 0.7324463940434619, "grad_norm": 11.34967364860296, "learning_rate": 2.676210817987647e-06, "loss": 0.3581, "step": 12690 }, { "epoch": 0.7330235779631179, "grad_norm": 3.495369176137086, "learning_rate": 2.6704381458176994e-06, "loss": 0.3369, "step": 12700 }, { "epoch": 0.7336007618827739, "grad_norm": 4.744362874719428, "learning_rate": 2.6646654736477516e-06, "loss": 0.3502, "step": 12710 }, { "epoch": 0.7341779458024299, "grad_norm": 2.554299735229823, "learning_rate": 2.658892801477804e-06, "loss": 0.3547, "step": 12720 }, { "epoch": 0.7347551297220859, "grad_norm": 4.114324789692039, "learning_rate": 2.653120129307857e-06, "loss": 0.3406, "step": 12730 }, { "epoch": 0.7353323136417419, "grad_norm": 5.99465461001967, "learning_rate": 2.6473474571379094e-06, "loss": 0.3568, "step": 12740 }, { "epoch": 0.7359094975613979, "grad_norm": 5.08850145360636, "learning_rate": 2.641574784967962e-06, "loss": 0.3437, "step": 12750 }, { "epoch": 0.7364866814810539, "grad_norm": 2.901320860603511, "learning_rate": 2.635802112798014e-06, "loss": 0.334, "step": 12760 }, { "epoch": 0.7370638654007099, "grad_norm": 7.740697658957906, "learning_rate": 2.6300294406280668e-06, "loss": 0.3361, "step": 12770 }, { "epoch": 0.7376410493203659, "grad_norm": 6.245600892762995, "learning_rate": 2.6242567684581194e-06, "loss": 0.3308, "step": 12780 }, { "epoch": 0.7382182332400219, "grad_norm": 5.308126429822944, "learning_rate": 2.618484096288172e-06, "loss": 0.3527, "step": 12790 }, { "epoch": 0.7387954171596779, "grad_norm": 4.212291401570202, "learning_rate": 2.6127114241182245e-06, "loss": 0.3409, "step": 12800 }, { "epoch": 0.7393726010793339, "grad_norm": 3.7060833092802556, "learning_rate": 2.606938751948277e-06, "loss": 0.337, "step": 12810 }, { "epoch": 0.7399497849989899, "grad_norm": 13.303775807909831, "learning_rate": 2.6011660797783293e-06, "loss": 0.3521, "step": 12820 }, { "epoch": 0.7405269689186459, "grad_norm": 5.855236173414753, "learning_rate": 2.595393407608382e-06, "loss": 0.344, "step": 12830 }, { "epoch": 0.7411041528383019, "grad_norm": 2.862607162292994, "learning_rate": 2.5896207354384345e-06, "loss": 0.3386, "step": 12840 }, { "epoch": 0.7416813367579579, "grad_norm": 3.190707127525178, "learning_rate": 2.583848063268487e-06, "loss": 0.3451, "step": 12850 }, { "epoch": 0.7422585206776139, "grad_norm": 7.479465788278306, "learning_rate": 2.5780753910985397e-06, "loss": 0.3418, "step": 12860 }, { "epoch": 0.7428357045972699, "grad_norm": 3.9314452580240795, "learning_rate": 2.572302718928592e-06, "loss": 0.3558, "step": 12870 }, { "epoch": 0.7434128885169259, "grad_norm": 2.586918485101635, "learning_rate": 2.5665300467586445e-06, "loss": 0.3521, "step": 12880 }, { "epoch": 0.7439900724365819, "grad_norm": 5.285658124727487, "learning_rate": 2.560757374588697e-06, "loss": 0.3467, "step": 12890 }, { "epoch": 0.7445672563562379, "grad_norm": 16.29975276837285, "learning_rate": 2.5549847024187497e-06, "loss": 0.338, "step": 12900 }, { "epoch": 0.7451444402758939, "grad_norm": 5.18800369782506, "learning_rate": 2.5492120302488023e-06, "loss": 0.3423, "step": 12910 }, { "epoch": 0.7457216241955499, "grad_norm": 2.7621346083831737, "learning_rate": 2.543439358078855e-06, "loss": 0.3474, "step": 12920 }, { "epoch": 0.7462988081152059, "grad_norm": 9.136730427994907, "learning_rate": 2.537666685908907e-06, "loss": 0.3599, "step": 12930 }, { "epoch": 0.7468759920348619, "grad_norm": 4.494380679642752, "learning_rate": 2.5318940137389596e-06, "loss": 0.3509, "step": 12940 }, { "epoch": 0.7474531759545179, "grad_norm": 3.8278742483492554, "learning_rate": 2.5261213415690122e-06, "loss": 0.3419, "step": 12950 }, { "epoch": 0.7480303598741739, "grad_norm": 13.52933963754357, "learning_rate": 2.5203486693990653e-06, "loss": 0.3255, "step": 12960 }, { "epoch": 0.7486075437938299, "grad_norm": 4.13788172504123, "learning_rate": 2.514575997229118e-06, "loss": 0.3446, "step": 12970 }, { "epoch": 0.7491847277134859, "grad_norm": 6.859750860335564, "learning_rate": 2.5088033250591704e-06, "loss": 0.3586, "step": 12980 }, { "epoch": 0.7497619116331419, "grad_norm": 9.235304443739427, "learning_rate": 2.503030652889223e-06, "loss": 0.3393, "step": 12990 }, { "epoch": 0.7503390955527979, "grad_norm": 5.829087177972612, "learning_rate": 2.4972579807192752e-06, "loss": 0.3518, "step": 13000 }, { "epoch": 0.7509162794724539, "grad_norm": 4.209497926424704, "learning_rate": 2.491485308549328e-06, "loss": 0.3392, "step": 13010 }, { "epoch": 0.7514934633921099, "grad_norm": 7.292629657573057, "learning_rate": 2.48571263637938e-06, "loss": 0.3253, "step": 13020 }, { "epoch": 0.7520706473117659, "grad_norm": 2.647834968655611, "learning_rate": 2.4799399642094326e-06, "loss": 0.3304, "step": 13030 }, { "epoch": 0.7526478312314219, "grad_norm": 3.2625519939417638, "learning_rate": 2.474167292039485e-06, "loss": 0.3389, "step": 13040 }, { "epoch": 0.7532250151510779, "grad_norm": 11.316624355429083, "learning_rate": 2.4683946198695378e-06, "loss": 0.3369, "step": 13050 }, { "epoch": 0.7538021990707339, "grad_norm": 9.673257149513491, "learning_rate": 2.4626219476995904e-06, "loss": 0.3334, "step": 13060 }, { "epoch": 0.7543793829903899, "grad_norm": 4.342384289505803, "learning_rate": 2.4568492755296426e-06, "loss": 0.3464, "step": 13070 }, { "epoch": 0.7549565669100459, "grad_norm": 11.18476020427808, "learning_rate": 2.451076603359695e-06, "loss": 0.3427, "step": 13080 }, { "epoch": 0.7555337508297019, "grad_norm": 3.7726393590227274, "learning_rate": 2.445303931189748e-06, "loss": 0.334, "step": 13090 }, { "epoch": 0.7561109347493579, "grad_norm": 12.714136611436194, "learning_rate": 2.4395312590198008e-06, "loss": 0.3268, "step": 13100 }, { "epoch": 0.7566881186690139, "grad_norm": 2.1269070192971307, "learning_rate": 2.433758586849853e-06, "loss": 0.3441, "step": 13110 }, { "epoch": 0.7572653025886699, "grad_norm": 6.054989937046523, "learning_rate": 2.4279859146799055e-06, "loss": 0.3428, "step": 13120 }, { "epoch": 0.7578424865083259, "grad_norm": 4.270071372155915, "learning_rate": 2.422213242509958e-06, "loss": 0.3539, "step": 13130 }, { "epoch": 0.7584196704279819, "grad_norm": 3.1183499805433814, "learning_rate": 2.4164405703400107e-06, "loss": 0.3347, "step": 13140 }, { "epoch": 0.7589968543476379, "grad_norm": 8.120134607992474, "learning_rate": 2.4106678981700633e-06, "loss": 0.3513, "step": 13150 }, { "epoch": 0.7595740382672939, "grad_norm": 4.275672737070938, "learning_rate": 2.4048952260001155e-06, "loss": 0.3333, "step": 13160 }, { "epoch": 0.7601512221869499, "grad_norm": 4.309409121092216, "learning_rate": 2.399122553830168e-06, "loss": 0.342, "step": 13170 }, { "epoch": 0.7607284061066059, "grad_norm": 14.975672326288917, "learning_rate": 2.3933498816602207e-06, "loss": 0.3491, "step": 13180 }, { "epoch": 0.7613055900262619, "grad_norm": 3.3391093670284984, "learning_rate": 2.3875772094902733e-06, "loss": 0.3377, "step": 13190 }, { "epoch": 0.7618827739459179, "grad_norm": 9.934238745041949, "learning_rate": 2.381804537320326e-06, "loss": 0.3316, "step": 13200 }, { "epoch": 0.7624599578655739, "grad_norm": 3.323343149796346, "learning_rate": 2.3760318651503785e-06, "loss": 0.3217, "step": 13210 }, { "epoch": 0.7630371417852299, "grad_norm": 5.084030929299541, "learning_rate": 2.3702591929804307e-06, "loss": 0.3582, "step": 13220 }, { "epoch": 0.7636143257048859, "grad_norm": 7.012741879467996, "learning_rate": 2.3644865208104833e-06, "loss": 0.3433, "step": 13230 }, { "epoch": 0.7641915096245419, "grad_norm": 4.5407340579502025, "learning_rate": 2.358713848640536e-06, "loss": 0.3388, "step": 13240 }, { "epoch": 0.7647686935441979, "grad_norm": 4.194705290127997, "learning_rate": 2.3529411764705885e-06, "loss": 0.3312, "step": 13250 }, { "epoch": 0.7653458774638539, "grad_norm": 3.7140413786909203, "learning_rate": 2.347168504300641e-06, "loss": 0.3496, "step": 13260 }, { "epoch": 0.7659230613835099, "grad_norm": 3.257080159852974, "learning_rate": 2.3413958321306936e-06, "loss": 0.3366, "step": 13270 }, { "epoch": 0.7665002453031659, "grad_norm": 5.234732296881502, "learning_rate": 2.335623159960746e-06, "loss": 0.3427, "step": 13280 }, { "epoch": 0.7670774292228218, "grad_norm": 6.870037441960529, "learning_rate": 2.3298504877907984e-06, "loss": 0.3354, "step": 13290 }, { "epoch": 0.7676546131424778, "grad_norm": 9.876776601961089, "learning_rate": 2.324077815620851e-06, "loss": 0.3315, "step": 13300 }, { "epoch": 0.7682317970621338, "grad_norm": 5.912982961911957, "learning_rate": 2.3183051434509036e-06, "loss": 0.3485, "step": 13310 }, { "epoch": 0.7688089809817898, "grad_norm": 4.832716437104076, "learning_rate": 2.312532471280956e-06, "loss": 0.3348, "step": 13320 }, { "epoch": 0.7693861649014458, "grad_norm": 5.098375721379664, "learning_rate": 2.3067597991110084e-06, "loss": 0.3484, "step": 13330 }, { "epoch": 0.7699633488211018, "grad_norm": 4.63656346157866, "learning_rate": 2.300987126941061e-06, "loss": 0.3366, "step": 13340 }, { "epoch": 0.7705405327407578, "grad_norm": 6.588466951994697, "learning_rate": 2.2952144547711136e-06, "loss": 0.3457, "step": 13350 }, { "epoch": 0.7711177166604138, "grad_norm": 4.824094255832894, "learning_rate": 2.2894417826011666e-06, "loss": 0.341, "step": 13360 }, { "epoch": 0.7716949005800698, "grad_norm": 4.637581573877111, "learning_rate": 2.2836691104312188e-06, "loss": 0.3554, "step": 13370 }, { "epoch": 0.7722720844997258, "grad_norm": 6.056831078560241, "learning_rate": 2.2778964382612714e-06, "loss": 0.3186, "step": 13380 }, { "epoch": 0.7728492684193818, "grad_norm": 45.47481090136826, "learning_rate": 2.272123766091324e-06, "loss": 0.3299, "step": 13390 }, { "epoch": 0.7734264523390378, "grad_norm": 22.884051964592864, "learning_rate": 2.2663510939213766e-06, "loss": 0.3392, "step": 13400 }, { "epoch": 0.7740036362586938, "grad_norm": 6.492791637464672, "learning_rate": 2.260578421751429e-06, "loss": 0.3436, "step": 13410 }, { "epoch": 0.7745808201783498, "grad_norm": 5.820865415832969, "learning_rate": 2.2548057495814813e-06, "loss": 0.3347, "step": 13420 }, { "epoch": 0.7751580040980058, "grad_norm": 3.6093618399609406, "learning_rate": 2.249033077411534e-06, "loss": 0.3457, "step": 13430 }, { "epoch": 0.7757351880176618, "grad_norm": 3.8630308240200977, "learning_rate": 2.2432604052415865e-06, "loss": 0.345, "step": 13440 }, { "epoch": 0.7763123719373178, "grad_norm": 5.534539731109275, "learning_rate": 2.237487733071639e-06, "loss": 0.3383, "step": 13450 }, { "epoch": 0.7768895558569738, "grad_norm": 9.50423287415909, "learning_rate": 2.2317150609016917e-06, "loss": 0.3528, "step": 13460 }, { "epoch": 0.7774667397766298, "grad_norm": 11.597704923923128, "learning_rate": 2.2259423887317443e-06, "loss": 0.3482, "step": 13470 }, { "epoch": 0.7780439236962858, "grad_norm": 4.724944252593318, "learning_rate": 2.2201697165617965e-06, "loss": 0.341, "step": 13480 }, { "epoch": 0.7786211076159418, "grad_norm": 4.741588520697521, "learning_rate": 2.214397044391849e-06, "loss": 0.3443, "step": 13490 }, { "epoch": 0.7791982915355978, "grad_norm": 4.723745867093743, "learning_rate": 2.2086243722219017e-06, "loss": 0.3468, "step": 13500 }, { "epoch": 0.7797754754552538, "grad_norm": 4.302802386626385, "learning_rate": 2.2028517000519543e-06, "loss": 0.3534, "step": 13510 }, { "epoch": 0.7803526593749098, "grad_norm": 3.6957459272751385, "learning_rate": 2.197079027882007e-06, "loss": 0.3438, "step": 13520 }, { "epoch": 0.7809298432945658, "grad_norm": 9.02964602805584, "learning_rate": 2.191306355712059e-06, "loss": 0.3507, "step": 13530 }, { "epoch": 0.7815070272142218, "grad_norm": 4.584120438677978, "learning_rate": 2.1855336835421117e-06, "loss": 0.3392, "step": 13540 }, { "epoch": 0.7820842111338778, "grad_norm": 3.634370316749477, "learning_rate": 2.1797610113721642e-06, "loss": 0.3434, "step": 13550 }, { "epoch": 0.7826613950535338, "grad_norm": 7.974537610574205, "learning_rate": 2.173988339202217e-06, "loss": 0.3488, "step": 13560 }, { "epoch": 0.7832385789731898, "grad_norm": 15.500270051358006, "learning_rate": 2.1682156670322694e-06, "loss": 0.3319, "step": 13570 }, { "epoch": 0.7838157628928458, "grad_norm": 3.6214865336598288, "learning_rate": 2.162442994862322e-06, "loss": 0.3431, "step": 13580 }, { "epoch": 0.7843929468125018, "grad_norm": 4.251663962722991, "learning_rate": 2.1566703226923742e-06, "loss": 0.339, "step": 13590 }, { "epoch": 0.7849701307321578, "grad_norm": 5.8477391728665875, "learning_rate": 2.150897650522427e-06, "loss": 0.3425, "step": 13600 }, { "epoch": 0.7855473146518138, "grad_norm": 12.661348588140084, "learning_rate": 2.1451249783524794e-06, "loss": 0.3385, "step": 13610 }, { "epoch": 0.7861244985714698, "grad_norm": 3.153875030872274, "learning_rate": 2.139352306182532e-06, "loss": 0.3429, "step": 13620 }, { "epoch": 0.7867016824911258, "grad_norm": 7.53800643811605, "learning_rate": 2.1335796340125846e-06, "loss": 0.3492, "step": 13630 }, { "epoch": 0.7872788664107818, "grad_norm": 4.143985301762935, "learning_rate": 2.127806961842637e-06, "loss": 0.3361, "step": 13640 }, { "epoch": 0.7878560503304378, "grad_norm": 3.005269949918135, "learning_rate": 2.12203428967269e-06, "loss": 0.3405, "step": 13650 }, { "epoch": 0.7884332342500938, "grad_norm": 3.018587955955484, "learning_rate": 2.1162616175027424e-06, "loss": 0.3447, "step": 13660 }, { "epoch": 0.7890104181697498, "grad_norm": 3.002589774112856, "learning_rate": 2.110488945332795e-06, "loss": 0.3253, "step": 13670 }, { "epoch": 0.7895876020894058, "grad_norm": 3.85103590680487, "learning_rate": 2.104716273162847e-06, "loss": 0.3395, "step": 13680 }, { "epoch": 0.7901647860090618, "grad_norm": 4.040110821545416, "learning_rate": 2.0989436009928998e-06, "loss": 0.3439, "step": 13690 }, { "epoch": 0.7907419699287178, "grad_norm": 8.395833969271274, "learning_rate": 2.0931709288229524e-06, "loss": 0.3334, "step": 13700 }, { "epoch": 0.7913191538483738, "grad_norm": 8.808629294549078, "learning_rate": 2.087398256653005e-06, "loss": 0.3516, "step": 13710 }, { "epoch": 0.7918963377680298, "grad_norm": 5.183013748264493, "learning_rate": 2.0816255844830576e-06, "loss": 0.3468, "step": 13720 }, { "epoch": 0.7924735216876858, "grad_norm": 4.651858039147579, "learning_rate": 2.0758529123131097e-06, "loss": 0.3383, "step": 13730 }, { "epoch": 0.7930507056073418, "grad_norm": 5.147274649825693, "learning_rate": 2.0700802401431623e-06, "loss": 0.3435, "step": 13740 }, { "epoch": 0.7936278895269978, "grad_norm": 3.3157064357282646, "learning_rate": 2.064307567973215e-06, "loss": 0.3374, "step": 13750 }, { "epoch": 0.7942050734466538, "grad_norm": 5.359738899671234, "learning_rate": 2.0585348958032675e-06, "loss": 0.3519, "step": 13760 }, { "epoch": 0.7947822573663098, "grad_norm": 3.232416306077255, "learning_rate": 2.05276222363332e-06, "loss": 0.3466, "step": 13770 }, { "epoch": 0.7953594412859658, "grad_norm": 5.634484175482586, "learning_rate": 2.0469895514633727e-06, "loss": 0.3435, "step": 13780 }, { "epoch": 0.7959366252056218, "grad_norm": 4.812177943413768, "learning_rate": 2.041216879293425e-06, "loss": 0.3394, "step": 13790 }, { "epoch": 0.7965138091252778, "grad_norm": 4.18690284048581, "learning_rate": 2.0354442071234775e-06, "loss": 0.339, "step": 13800 }, { "epoch": 0.7970909930449338, "grad_norm": 2.6302278323472024, "learning_rate": 2.02967153495353e-06, "loss": 0.3211, "step": 13810 }, { "epoch": 0.7976681769645898, "grad_norm": 3.7677609162356567, "learning_rate": 2.0238988627835827e-06, "loss": 0.3459, "step": 13820 }, { "epoch": 0.7982453608842458, "grad_norm": 3.877790721238228, "learning_rate": 2.0181261906136353e-06, "loss": 0.3527, "step": 13830 }, { "epoch": 0.7988225448039018, "grad_norm": 3.3882391753576897, "learning_rate": 2.0123535184436874e-06, "loss": 0.3418, "step": 13840 }, { "epoch": 0.7993997287235578, "grad_norm": 8.275704148522843, "learning_rate": 2.00658084627374e-06, "loss": 0.3584, "step": 13850 }, { "epoch": 0.7999769126432138, "grad_norm": 4.07120977222577, "learning_rate": 2.0008081741037926e-06, "loss": 0.3369, "step": 13860 }, { "epoch": 0.8005540965628698, "grad_norm": 6.796016189984939, "learning_rate": 1.9950355019338452e-06, "loss": 0.3359, "step": 13870 }, { "epoch": 0.8011312804825258, "grad_norm": 4.70618096154267, "learning_rate": 1.989262829763898e-06, "loss": 0.3383, "step": 13880 }, { "epoch": 0.8017084644021818, "grad_norm": 9.622472004428674, "learning_rate": 1.9834901575939504e-06, "loss": 0.3492, "step": 13890 }, { "epoch": 0.8022856483218378, "grad_norm": 20.930166870041855, "learning_rate": 1.977717485424003e-06, "loss": 0.3343, "step": 13900 }, { "epoch": 0.8028628322414938, "grad_norm": 10.85418413095689, "learning_rate": 1.9719448132540556e-06, "loss": 0.3399, "step": 13910 }, { "epoch": 0.8034400161611498, "grad_norm": 3.9293112562677353, "learning_rate": 1.9661721410841082e-06, "loss": 0.363, "step": 13920 }, { "epoch": 0.8040172000808058, "grad_norm": 10.773909045381199, "learning_rate": 1.960399468914161e-06, "loss": 0.3427, "step": 13930 }, { "epoch": 0.8045943840004618, "grad_norm": 4.932078250676689, "learning_rate": 1.954626796744213e-06, "loss": 0.3353, "step": 13940 }, { "epoch": 0.8051715679201178, "grad_norm": 5.376321864962683, "learning_rate": 1.9488541245742656e-06, "loss": 0.3502, "step": 13950 }, { "epoch": 0.8057487518397738, "grad_norm": 4.014977881261309, "learning_rate": 1.943081452404318e-06, "loss": 0.3169, "step": 13960 }, { "epoch": 0.8063259357594298, "grad_norm": 8.651359182721874, "learning_rate": 1.9373087802343708e-06, "loss": 0.3349, "step": 13970 }, { "epoch": 0.8069031196790858, "grad_norm": 4.683539873403375, "learning_rate": 1.9315361080644234e-06, "loss": 0.3366, "step": 13980 }, { "epoch": 0.8074803035987418, "grad_norm": 7.755677266741849, "learning_rate": 1.9257634358944756e-06, "loss": 0.3377, "step": 13990 }, { "epoch": 0.8080574875183978, "grad_norm": 14.533972050557356, "learning_rate": 1.919990763724528e-06, "loss": 0.3337, "step": 14000 }, { "epoch": 0.8086346714380538, "grad_norm": 3.5287179981983052, "learning_rate": 1.9142180915545807e-06, "loss": 0.3371, "step": 14010 }, { "epoch": 0.8092118553577098, "grad_norm": 3.6897961566375588, "learning_rate": 1.9084454193846333e-06, "loss": 0.3437, "step": 14020 }, { "epoch": 0.8097890392773658, "grad_norm": 4.275916003879041, "learning_rate": 1.9026727472146857e-06, "loss": 0.3505, "step": 14030 }, { "epoch": 0.8103662231970218, "grad_norm": 5.853581087992128, "learning_rate": 1.8969000750447383e-06, "loss": 0.3447, "step": 14040 }, { "epoch": 0.8109434071166778, "grad_norm": 4.456219664367194, "learning_rate": 1.891127402874791e-06, "loss": 0.3405, "step": 14050 }, { "epoch": 0.8115205910363338, "grad_norm": 3.609665854590679, "learning_rate": 1.8853547307048433e-06, "loss": 0.3539, "step": 14060 }, { "epoch": 0.8120977749559897, "grad_norm": 8.81894657017933, "learning_rate": 1.879582058534896e-06, "loss": 0.3434, "step": 14070 }, { "epoch": 0.8126749588756457, "grad_norm": 73.62257234022623, "learning_rate": 1.8738093863649485e-06, "loss": 0.3362, "step": 14080 }, { "epoch": 0.8132521427953017, "grad_norm": 4.843976388543906, "learning_rate": 1.8680367141950009e-06, "loss": 0.3522, "step": 14090 }, { "epoch": 0.8138293267149577, "grad_norm": 6.3647153836539925, "learning_rate": 1.8622640420250535e-06, "loss": 0.3351, "step": 14100 }, { "epoch": 0.8144065106346137, "grad_norm": 15.067604508734842, "learning_rate": 1.8564913698551059e-06, "loss": 0.341, "step": 14110 }, { "epoch": 0.8149836945542697, "grad_norm": 2.837118277081176, "learning_rate": 1.8507186976851585e-06, "loss": 0.3433, "step": 14120 }, { "epoch": 0.8155608784739257, "grad_norm": 3.8243460158503337, "learning_rate": 1.844946025515211e-06, "loss": 0.3433, "step": 14130 }, { "epoch": 0.8161380623935817, "grad_norm": 5.476403512789154, "learning_rate": 1.8391733533452635e-06, "loss": 0.3507, "step": 14140 }, { "epoch": 0.8167152463132377, "grad_norm": 7.503101973214223, "learning_rate": 1.8334006811753163e-06, "loss": 0.3426, "step": 14150 }, { "epoch": 0.8172924302328937, "grad_norm": 13.62010751923973, "learning_rate": 1.8276280090053689e-06, "loss": 0.3467, "step": 14160 }, { "epoch": 0.8178696141525497, "grad_norm": 5.792511670187818, "learning_rate": 1.8218553368354215e-06, "loss": 0.3618, "step": 14170 }, { "epoch": 0.8184467980722057, "grad_norm": 3.268214743983259, "learning_rate": 1.8160826646654738e-06, "loss": 0.3383, "step": 14180 }, { "epoch": 0.8190239819918617, "grad_norm": 4.2120443389536, "learning_rate": 1.8103099924955264e-06, "loss": 0.3444, "step": 14190 }, { "epoch": 0.8196011659115177, "grad_norm": 5.497732760033444, "learning_rate": 1.804537320325579e-06, "loss": 0.3321, "step": 14200 }, { "epoch": 0.8201783498311737, "grad_norm": 3.752608960335712, "learning_rate": 1.7987646481556314e-06, "loss": 0.34, "step": 14210 }, { "epoch": 0.8207555337508297, "grad_norm": 5.61044709148666, "learning_rate": 1.792991975985684e-06, "loss": 0.3477, "step": 14220 }, { "epoch": 0.8213327176704857, "grad_norm": 3.2289347743222, "learning_rate": 1.7872193038157364e-06, "loss": 0.3354, "step": 14230 }, { "epoch": 0.8219099015901417, "grad_norm": 4.132295269856051, "learning_rate": 1.781446631645789e-06, "loss": 0.3454, "step": 14240 }, { "epoch": 0.8224870855097977, "grad_norm": 8.94256483462556, "learning_rate": 1.7756739594758416e-06, "loss": 0.3437, "step": 14250 }, { "epoch": 0.8230642694294537, "grad_norm": 6.100086954024239, "learning_rate": 1.769901287305894e-06, "loss": 0.3358, "step": 14260 }, { "epoch": 0.8236414533491097, "grad_norm": 4.7378243458253255, "learning_rate": 1.7641286151359466e-06, "loss": 0.3357, "step": 14270 }, { "epoch": 0.8242186372687657, "grad_norm": 4.402684079414389, "learning_rate": 1.7583559429659992e-06, "loss": 0.346, "step": 14280 }, { "epoch": 0.8247958211884217, "grad_norm": 7.94048495060501, "learning_rate": 1.7525832707960516e-06, "loss": 0.3334, "step": 14290 }, { "epoch": 0.8253730051080777, "grad_norm": 4.248624676291536, "learning_rate": 1.7468105986261042e-06, "loss": 0.3348, "step": 14300 }, { "epoch": 0.8259501890277336, "grad_norm": 5.863500294171999, "learning_rate": 1.7410379264561568e-06, "loss": 0.3545, "step": 14310 }, { "epoch": 0.8265273729473896, "grad_norm": 5.462151225452732, "learning_rate": 1.7352652542862091e-06, "loss": 0.3324, "step": 14320 }, { "epoch": 0.8271045568670456, "grad_norm": 2.757724626485063, "learning_rate": 1.7294925821162617e-06, "loss": 0.3411, "step": 14330 }, { "epoch": 0.8276817407867016, "grad_norm": 2.5548734181876456, "learning_rate": 1.7237199099463141e-06, "loss": 0.3278, "step": 14340 }, { "epoch": 0.8282589247063576, "grad_norm": 4.871936623866052, "learning_rate": 1.7179472377763667e-06, "loss": 0.3306, "step": 14350 }, { "epoch": 0.8288361086260136, "grad_norm": 3.10101382610628, "learning_rate": 1.7121745656064193e-06, "loss": 0.3387, "step": 14360 }, { "epoch": 0.8294132925456696, "grad_norm": 10.575051742858964, "learning_rate": 1.7064018934364717e-06, "loss": 0.3428, "step": 14370 }, { "epoch": 0.8299904764653256, "grad_norm": 4.367916255458309, "learning_rate": 1.7006292212665243e-06, "loss": 0.3397, "step": 14380 }, { "epoch": 0.8305676603849816, "grad_norm": 6.616840660969775, "learning_rate": 1.694856549096577e-06, "loss": 0.331, "step": 14390 }, { "epoch": 0.8311448443046376, "grad_norm": 12.71189460753895, "learning_rate": 1.6890838769266293e-06, "loss": 0.3352, "step": 14400 }, { "epoch": 0.8317220282242936, "grad_norm": 8.817998813310439, "learning_rate": 1.683311204756682e-06, "loss": 0.3425, "step": 14410 }, { "epoch": 0.8322992121439496, "grad_norm": 4.766835462814113, "learning_rate": 1.6775385325867347e-06, "loss": 0.3398, "step": 14420 }, { "epoch": 0.8328763960636056, "grad_norm": 4.169057753174133, "learning_rate": 1.6717658604167873e-06, "loss": 0.3294, "step": 14430 }, { "epoch": 0.8334535799832616, "grad_norm": 5.150260088289917, "learning_rate": 1.6659931882468397e-06, "loss": 0.3273, "step": 14440 }, { "epoch": 0.8340307639029176, "grad_norm": 5.053684619580007, "learning_rate": 1.6602205160768923e-06, "loss": 0.3642, "step": 14450 }, { "epoch": 0.8346079478225736, "grad_norm": 4.068134996065944, "learning_rate": 1.6544478439069447e-06, "loss": 0.3441, "step": 14460 }, { "epoch": 0.8351851317422296, "grad_norm": 4.004964659229183, "learning_rate": 1.6486751717369972e-06, "loss": 0.3408, "step": 14470 }, { "epoch": 0.8357623156618856, "grad_norm": 4.02559591240971, "learning_rate": 1.6429024995670498e-06, "loss": 0.3385, "step": 14480 }, { "epoch": 0.8363394995815416, "grad_norm": 5.168203382157519, "learning_rate": 1.6371298273971022e-06, "loss": 0.3424, "step": 14490 }, { "epoch": 0.8369166835011976, "grad_norm": 6.3562693056901285, "learning_rate": 1.6313571552271548e-06, "loss": 0.3432, "step": 14500 }, { "epoch": 0.8374938674208536, "grad_norm": 7.105132387517162, "learning_rate": 1.6255844830572074e-06, "loss": 0.3385, "step": 14510 }, { "epoch": 0.8380710513405096, "grad_norm": 5.167657366774912, "learning_rate": 1.6198118108872598e-06, "loss": 0.3357, "step": 14520 }, { "epoch": 0.8386482352601656, "grad_norm": 5.322779082569645, "learning_rate": 1.6140391387173124e-06, "loss": 0.351, "step": 14530 }, { "epoch": 0.8392254191798216, "grad_norm": 5.555960049234855, "learning_rate": 1.608266466547365e-06, "loss": 0.3427, "step": 14540 }, { "epoch": 0.8398026030994776, "grad_norm": 14.30097637805443, "learning_rate": 1.6024937943774174e-06, "loss": 0.3268, "step": 14550 }, { "epoch": 0.8403797870191336, "grad_norm": 5.930941981679822, "learning_rate": 1.59672112220747e-06, "loss": 0.3378, "step": 14560 }, { "epoch": 0.8409569709387896, "grad_norm": 6.41989158034055, "learning_rate": 1.5909484500375224e-06, "loss": 0.3425, "step": 14570 }, { "epoch": 0.8415341548584456, "grad_norm": 5.309996017042821, "learning_rate": 1.585175777867575e-06, "loss": 0.347, "step": 14580 }, { "epoch": 0.8421113387781016, "grad_norm": 5.483533161522683, "learning_rate": 1.5794031056976276e-06, "loss": 0.3292, "step": 14590 }, { "epoch": 0.8426885226977576, "grad_norm": 5.692527937195376, "learning_rate": 1.57363043352768e-06, "loss": 0.3377, "step": 14600 }, { "epoch": 0.8432657066174136, "grad_norm": 4.521114906145438, "learning_rate": 1.5678577613577325e-06, "loss": 0.33, "step": 14610 }, { "epoch": 0.8438428905370696, "grad_norm": 5.387502803416387, "learning_rate": 1.5620850891877851e-06, "loss": 0.3418, "step": 14620 }, { "epoch": 0.8444200744567256, "grad_norm": 3.5934061953432783, "learning_rate": 1.5563124170178375e-06, "loss": 0.3386, "step": 14630 }, { "epoch": 0.8449972583763816, "grad_norm": 11.194143114734654, "learning_rate": 1.5505397448478901e-06, "loss": 0.345, "step": 14640 }, { "epoch": 0.8455744422960376, "grad_norm": 10.094088344395155, "learning_rate": 1.5447670726779427e-06, "loss": 0.3526, "step": 14650 }, { "epoch": 0.8461516262156936, "grad_norm": 3.4477998903369538, "learning_rate": 1.5389944005079951e-06, "loss": 0.3373, "step": 14660 }, { "epoch": 0.8467288101353496, "grad_norm": 7.047707090908251, "learning_rate": 1.533221728338048e-06, "loss": 0.3403, "step": 14670 }, { "epoch": 0.8473059940550056, "grad_norm": 2.8576436370742893, "learning_rate": 1.5274490561681005e-06, "loss": 0.3453, "step": 14680 }, { "epoch": 0.8478831779746616, "grad_norm": 12.938468126614902, "learning_rate": 1.521676383998153e-06, "loss": 0.336, "step": 14690 }, { "epoch": 0.8484603618943176, "grad_norm": 5.011512190053106, "learning_rate": 1.5159037118282055e-06, "loss": 0.3424, "step": 14700 }, { "epoch": 0.8490375458139736, "grad_norm": 5.063829942230378, "learning_rate": 1.510131039658258e-06, "loss": 0.346, "step": 14710 }, { "epoch": 0.8496147297336296, "grad_norm": 6.679043646426254, "learning_rate": 1.5043583674883105e-06, "loss": 0.3422, "step": 14720 }, { "epoch": 0.8501919136532856, "grad_norm": 5.800788583586723, "learning_rate": 1.498585695318363e-06, "loss": 0.3421, "step": 14730 }, { "epoch": 0.8507690975729416, "grad_norm": 3.968739599812534, "learning_rate": 1.4928130231484157e-06, "loss": 0.3395, "step": 14740 }, { "epoch": 0.8513462814925976, "grad_norm": 7.880056504602316, "learning_rate": 1.487040350978468e-06, "loss": 0.3326, "step": 14750 }, { "epoch": 0.8519234654122536, "grad_norm": 5.685089944033708, "learning_rate": 1.4812676788085207e-06, "loss": 0.3356, "step": 14760 }, { "epoch": 0.8525006493319096, "grad_norm": 4.795808126193561, "learning_rate": 1.475495006638573e-06, "loss": 0.316, "step": 14770 }, { "epoch": 0.8530778332515656, "grad_norm": 6.693555623774071, "learning_rate": 1.4697223344686256e-06, "loss": 0.3416, "step": 14780 }, { "epoch": 0.8536550171712216, "grad_norm": 5.752237694272574, "learning_rate": 1.4639496622986782e-06, "loss": 0.3348, "step": 14790 }, { "epoch": 0.8542322010908776, "grad_norm": 7.135726397784308, "learning_rate": 1.4581769901287306e-06, "loss": 0.3378, "step": 14800 }, { "epoch": 0.8548093850105336, "grad_norm": 3.2833774570611234, "learning_rate": 1.4524043179587832e-06, "loss": 0.3504, "step": 14810 }, { "epoch": 0.8553865689301896, "grad_norm": 6.256462721047408, "learning_rate": 1.4466316457888358e-06, "loss": 0.349, "step": 14820 }, { "epoch": 0.8559637528498456, "grad_norm": 6.040295806596955, "learning_rate": 1.4408589736188882e-06, "loss": 0.3208, "step": 14830 }, { "epoch": 0.8565409367695016, "grad_norm": 4.729301211824621, "learning_rate": 1.4350863014489408e-06, "loss": 0.3277, "step": 14840 }, { "epoch": 0.8571181206891576, "grad_norm": 4.313779706679082, "learning_rate": 1.4293136292789934e-06, "loss": 0.3306, "step": 14850 }, { "epoch": 0.8576953046088136, "grad_norm": 11.790043476255672, "learning_rate": 1.4235409571090458e-06, "loss": 0.3391, "step": 14860 }, { "epoch": 0.8582724885284696, "grad_norm": 7.642639050872643, "learning_rate": 1.4177682849390984e-06, "loss": 0.3388, "step": 14870 }, { "epoch": 0.8588496724481256, "grad_norm": 7.260077362208394, "learning_rate": 1.4119956127691508e-06, "loss": 0.3422, "step": 14880 }, { "epoch": 0.8594268563677816, "grad_norm": 4.754130590048299, "learning_rate": 1.4062229405992034e-06, "loss": 0.3436, "step": 14890 }, { "epoch": 0.8600040402874376, "grad_norm": 6.225554657816755, "learning_rate": 1.400450268429256e-06, "loss": 0.3352, "step": 14900 }, { "epoch": 0.8605812242070936, "grad_norm": 9.369073827925245, "learning_rate": 1.3946775962593083e-06, "loss": 0.3552, "step": 14910 }, { "epoch": 0.8611584081267496, "grad_norm": 22.876915271022913, "learning_rate": 1.388904924089361e-06, "loss": 0.3492, "step": 14920 }, { "epoch": 0.8617355920464056, "grad_norm": 3.2981136451706132, "learning_rate": 1.3831322519194135e-06, "loss": 0.3328, "step": 14930 }, { "epoch": 0.8623127759660616, "grad_norm": 15.61626227774467, "learning_rate": 1.3773595797494663e-06, "loss": 0.337, "step": 14940 }, { "epoch": 0.8628899598857176, "grad_norm": 3.605130100716397, "learning_rate": 1.3715869075795187e-06, "loss": 0.3385, "step": 14950 }, { "epoch": 0.8634671438053736, "grad_norm": 12.284387392936685, "learning_rate": 1.3658142354095713e-06, "loss": 0.338, "step": 14960 }, { "epoch": 0.8640443277250296, "grad_norm": 3.278014138295641, "learning_rate": 1.360041563239624e-06, "loss": 0.3337, "step": 14970 }, { "epoch": 0.8646215116446856, "grad_norm": 6.104850658183799, "learning_rate": 1.3542688910696763e-06, "loss": 0.3276, "step": 14980 }, { "epoch": 0.8651986955643416, "grad_norm": 5.279235884437097, "learning_rate": 1.348496218899729e-06, "loss": 0.3289, "step": 14990 }, { "epoch": 0.8657758794839976, "grad_norm": 8.449467479597608, "learning_rate": 1.3427235467297813e-06, "loss": 0.3314, "step": 15000 }, { "epoch": 0.8663530634036536, "grad_norm": 5.665683889458834, "learning_rate": 1.3369508745598339e-06, "loss": 0.3416, "step": 15010 }, { "epoch": 0.8669302473233096, "grad_norm": 3.9335607560335735, "learning_rate": 1.3311782023898865e-06, "loss": 0.3525, "step": 15020 }, { "epoch": 0.8675074312429656, "grad_norm": 2.894014459167942, "learning_rate": 1.3254055302199389e-06, "loss": 0.3328, "step": 15030 }, { "epoch": 0.8680846151626216, "grad_norm": 65.75688988910291, "learning_rate": 1.3196328580499915e-06, "loss": 0.331, "step": 15040 }, { "epoch": 0.8686617990822776, "grad_norm": 61.00701606633841, "learning_rate": 1.313860185880044e-06, "loss": 0.3407, "step": 15050 }, { "epoch": 0.8692389830019336, "grad_norm": 4.364812051967769, "learning_rate": 1.3080875137100965e-06, "loss": 0.335, "step": 15060 }, { "epoch": 0.8698161669215896, "grad_norm": 24.662511548298095, "learning_rate": 1.302314841540149e-06, "loss": 0.3348, "step": 15070 }, { "epoch": 0.8703933508412456, "grad_norm": 4.160466497716753, "learning_rate": 1.2965421693702016e-06, "loss": 0.3252, "step": 15080 }, { "epoch": 0.8709705347609016, "grad_norm": 6.134539334165056, "learning_rate": 1.290769497200254e-06, "loss": 0.343, "step": 15090 }, { "epoch": 0.8715477186805576, "grad_norm": 7.228713295937482, "learning_rate": 1.2849968250303066e-06, "loss": 0.3338, "step": 15100 }, { "epoch": 0.8721249026002136, "grad_norm": 4.918201123965152, "learning_rate": 1.279224152860359e-06, "loss": 0.3527, "step": 15110 }, { "epoch": 0.8727020865198696, "grad_norm": 11.079488033226095, "learning_rate": 1.2734514806904116e-06, "loss": 0.3485, "step": 15120 }, { "epoch": 0.8732792704395256, "grad_norm": 4.928609183215457, "learning_rate": 1.2676788085204642e-06, "loss": 0.3354, "step": 15130 }, { "epoch": 0.8738564543591816, "grad_norm": 6.740385950730952, "learning_rate": 1.2619061363505166e-06, "loss": 0.333, "step": 15140 }, { "epoch": 0.8744336382788376, "grad_norm": 4.67490770106929, "learning_rate": 1.2561334641805692e-06, "loss": 0.331, "step": 15150 }, { "epoch": 0.8750108221984936, "grad_norm": 13.018881009854521, "learning_rate": 1.2503607920106218e-06, "loss": 0.3418, "step": 15160 }, { "epoch": 0.8755880061181496, "grad_norm": 5.447550954730696, "learning_rate": 1.2445881198406744e-06, "loss": 0.3372, "step": 15170 }, { "epoch": 0.8761651900378056, "grad_norm": 3.273410656701021, "learning_rate": 1.238815447670727e-06, "loss": 0.3199, "step": 15180 }, { "epoch": 0.8767423739574616, "grad_norm": 4.277523732895546, "learning_rate": 1.2330427755007794e-06, "loss": 0.341, "step": 15190 }, { "epoch": 0.8773195578771176, "grad_norm": 15.742782882783079, "learning_rate": 1.227270103330832e-06, "loss": 0.341, "step": 15200 }, { "epoch": 0.8778967417967736, "grad_norm": 5.131013300289669, "learning_rate": 1.2214974311608844e-06, "loss": 0.3303, "step": 15210 }, { "epoch": 0.8784739257164296, "grad_norm": 12.866336037106171, "learning_rate": 1.215724758990937e-06, "loss": 0.3442, "step": 15220 }, { "epoch": 0.8790511096360856, "grad_norm": 5.127710193300859, "learning_rate": 1.2099520868209895e-06, "loss": 0.352, "step": 15230 }, { "epoch": 0.8796282935557416, "grad_norm": 7.005612361267027, "learning_rate": 1.204179414651042e-06, "loss": 0.3376, "step": 15240 }, { "epoch": 0.8802054774753976, "grad_norm": 19.994629507428854, "learning_rate": 1.1984067424810945e-06, "loss": 0.3366, "step": 15250 }, { "epoch": 0.8807826613950536, "grad_norm": 19.289162587657003, "learning_rate": 1.1926340703111471e-06, "loss": 0.3339, "step": 15260 }, { "epoch": 0.8813598453147096, "grad_norm": 5.780176197085242, "learning_rate": 1.1868613981411997e-06, "loss": 0.3328, "step": 15270 }, { "epoch": 0.8819370292343656, "grad_norm": 5.322625710215796, "learning_rate": 1.1810887259712523e-06, "loss": 0.3364, "step": 15280 }, { "epoch": 0.8825142131540216, "grad_norm": 4.160665784312188, "learning_rate": 1.1753160538013047e-06, "loss": 0.3298, "step": 15290 }, { "epoch": 0.8830913970736776, "grad_norm": 39.76071118055878, "learning_rate": 1.1695433816313573e-06, "loss": 0.348, "step": 15300 }, { "epoch": 0.8836685809933336, "grad_norm": 3.5659508030627958, "learning_rate": 1.16377070946141e-06, "loss": 0.3271, "step": 15310 }, { "epoch": 0.8842457649129896, "grad_norm": 4.436695102429374, "learning_rate": 1.1579980372914623e-06, "loss": 0.3407, "step": 15320 }, { "epoch": 0.8848229488326456, "grad_norm": 11.403167564876071, "learning_rate": 1.1522253651215149e-06, "loss": 0.3383, "step": 15330 }, { "epoch": 0.8854001327523016, "grad_norm": 4.535641723893359, "learning_rate": 1.1464526929515673e-06, "loss": 0.3328, "step": 15340 }, { "epoch": 0.8859773166719576, "grad_norm": 7.663265877666311, "learning_rate": 1.1406800207816199e-06, "loss": 0.3365, "step": 15350 }, { "epoch": 0.8865545005916136, "grad_norm": 7.820505381715719, "learning_rate": 1.1349073486116725e-06, "loss": 0.3432, "step": 15360 }, { "epoch": 0.8871316845112696, "grad_norm": 6.006599271526383, "learning_rate": 1.1291346764417248e-06, "loss": 0.3487, "step": 15370 }, { "epoch": 0.8877088684309256, "grad_norm": 11.881435307105928, "learning_rate": 1.1233620042717774e-06, "loss": 0.3332, "step": 15380 }, { "epoch": 0.8882860523505816, "grad_norm": 3.4819561819318103, "learning_rate": 1.11758933210183e-06, "loss": 0.335, "step": 15390 }, { "epoch": 0.8888632362702376, "grad_norm": 6.893650513052578, "learning_rate": 1.1118166599318826e-06, "loss": 0.3493, "step": 15400 }, { "epoch": 0.8894404201898936, "grad_norm": 6.925493159604081, "learning_rate": 1.1060439877619352e-06, "loss": 0.3448, "step": 15410 }, { "epoch": 0.8900176041095496, "grad_norm": 5.5200268397262775, "learning_rate": 1.1002713155919876e-06, "loss": 0.3256, "step": 15420 }, { "epoch": 0.8905947880292056, "grad_norm": 7.638890582405007, "learning_rate": 1.0944986434220402e-06, "loss": 0.3327, "step": 15430 }, { "epoch": 0.8911719719488616, "grad_norm": 4.585636430659814, "learning_rate": 1.0887259712520926e-06, "loss": 0.3386, "step": 15440 }, { "epoch": 0.8917491558685176, "grad_norm": 7.147252901738225, "learning_rate": 1.0829532990821452e-06, "loss": 0.3402, "step": 15450 }, { "epoch": 0.8923263397881736, "grad_norm": 5.40597402568476, "learning_rate": 1.0771806269121978e-06, "loss": 0.3385, "step": 15460 }, { "epoch": 0.8929035237078295, "grad_norm": 4.260919590930795, "learning_rate": 1.0714079547422502e-06, "loss": 0.3232, "step": 15470 }, { "epoch": 0.8934807076274855, "grad_norm": 9.083532528696407, "learning_rate": 1.0656352825723028e-06, "loss": 0.354, "step": 15480 }, { "epoch": 0.8940578915471415, "grad_norm": 4.996043459346209, "learning_rate": 1.0598626104023554e-06, "loss": 0.329, "step": 15490 }, { "epoch": 0.8946350754667975, "grad_norm": 4.117500728065101, "learning_rate": 1.0540899382324078e-06, "loss": 0.3269, "step": 15500 }, { "epoch": 0.8952122593864535, "grad_norm": 4.9248570764155435, "learning_rate": 1.0483172660624604e-06, "loss": 0.3321, "step": 15510 }, { "epoch": 0.8957894433061095, "grad_norm": 4.696024737813827, "learning_rate": 1.042544593892513e-06, "loss": 0.3309, "step": 15520 }, { "epoch": 0.8963666272257655, "grad_norm": 7.733578705149999, "learning_rate": 1.0367719217225656e-06, "loss": 0.3305, "step": 15530 }, { "epoch": 0.8969438111454215, "grad_norm": 5.394346330597671, "learning_rate": 1.0309992495526181e-06, "loss": 0.3388, "step": 15540 }, { "epoch": 0.8975209950650774, "grad_norm": 5.311667852016155, "learning_rate": 1.0252265773826705e-06, "loss": 0.3376, "step": 15550 }, { "epoch": 0.8980981789847334, "grad_norm": 25.908831946398365, "learning_rate": 1.0194539052127231e-06, "loss": 0.3265, "step": 15560 }, { "epoch": 0.8986753629043894, "grad_norm": 33.665974796607145, "learning_rate": 1.0136812330427755e-06, "loss": 0.3348, "step": 15570 }, { "epoch": 0.8992525468240454, "grad_norm": 6.825259095726871, "learning_rate": 1.0079085608728281e-06, "loss": 0.3453, "step": 15580 }, { "epoch": 0.8998297307437014, "grad_norm": 3.1735048623731092, "learning_rate": 1.0021358887028807e-06, "loss": 0.3218, "step": 15590 }, { "epoch": 0.9004069146633574, "grad_norm": 9.114656814093948, "learning_rate": 9.96363216532933e-07, "loss": 0.3228, "step": 15600 }, { "epoch": 0.9009840985830134, "grad_norm": 5.87155385500696, "learning_rate": 9.905905443629857e-07, "loss": 0.3309, "step": 15610 }, { "epoch": 0.9015612825026694, "grad_norm": 14.717991716657272, "learning_rate": 9.848178721930383e-07, "loss": 0.3486, "step": 15620 }, { "epoch": 0.9021384664223254, "grad_norm": 3.7767948462311067, "learning_rate": 9.790452000230907e-07, "loss": 0.3182, "step": 15630 }, { "epoch": 0.9027156503419814, "grad_norm": 3.6787944993169006, "learning_rate": 9.732725278531433e-07, "loss": 0.3336, "step": 15640 }, { "epoch": 0.9032928342616374, "grad_norm": 11.812395475228488, "learning_rate": 9.674998556831959e-07, "loss": 0.3286, "step": 15650 }, { "epoch": 0.9038700181812934, "grad_norm": 5.186436534605756, "learning_rate": 9.617271835132485e-07, "loss": 0.3275, "step": 15660 }, { "epoch": 0.9044472021009494, "grad_norm": 4.424154650673984, "learning_rate": 9.559545113433009e-07, "loss": 0.3185, "step": 15670 }, { "epoch": 0.9050243860206054, "grad_norm": 6.3580022358578105, "learning_rate": 9.501818391733534e-07, "loss": 0.3226, "step": 15680 }, { "epoch": 0.9056015699402614, "grad_norm": 6.310999676892304, "learning_rate": 9.444091670034059e-07, "loss": 0.3304, "step": 15690 }, { "epoch": 0.9061787538599174, "grad_norm": 3.7913010315172633, "learning_rate": 9.386364948334585e-07, "loss": 0.3412, "step": 15700 }, { "epoch": 0.9067559377795734, "grad_norm": 3.67407928861924, "learning_rate": 9.32863822663511e-07, "loss": 0.341, "step": 15710 }, { "epoch": 0.9073331216992294, "grad_norm": 7.334570042771433, "learning_rate": 9.270911504935635e-07, "loss": 0.3265, "step": 15720 }, { "epoch": 0.9079103056188854, "grad_norm": 6.456404235720165, "learning_rate": 9.213184783236161e-07, "loss": 0.3444, "step": 15730 }, { "epoch": 0.9084874895385414, "grad_norm": 4.201612752804453, "learning_rate": 9.155458061536686e-07, "loss": 0.348, "step": 15740 }, { "epoch": 0.9090646734581974, "grad_norm": 5.071365974662106, "learning_rate": 9.097731339837211e-07, "loss": 0.3153, "step": 15750 }, { "epoch": 0.9096418573778534, "grad_norm": 5.035641882967374, "learning_rate": 9.040004618137736e-07, "loss": 0.3359, "step": 15760 }, { "epoch": 0.9102190412975094, "grad_norm": 12.03037859002001, "learning_rate": 8.982277896438262e-07, "loss": 0.3242, "step": 15770 }, { "epoch": 0.9107962252171654, "grad_norm": 7.140437657447056, "learning_rate": 8.924551174738787e-07, "loss": 0.3333, "step": 15780 }, { "epoch": 0.9113734091368214, "grad_norm": 8.294728901327792, "learning_rate": 8.866824453039313e-07, "loss": 0.3267, "step": 15790 }, { "epoch": 0.9119505930564774, "grad_norm": 5.258892430294796, "learning_rate": 8.809097731339839e-07, "loss": 0.3306, "step": 15800 }, { "epoch": 0.9125277769761334, "grad_norm": 4.56809810586393, "learning_rate": 8.751371009640364e-07, "loss": 0.3354, "step": 15810 }, { "epoch": 0.9131049608957894, "grad_norm": 8.754714473224658, "learning_rate": 8.693644287940889e-07, "loss": 0.3258, "step": 15820 }, { "epoch": 0.9136821448154454, "grad_norm": 10.722318127648052, "learning_rate": 8.635917566241415e-07, "loss": 0.3251, "step": 15830 }, { "epoch": 0.9142593287351014, "grad_norm": 17.100240147200765, "learning_rate": 8.578190844541939e-07, "loss": 0.3243, "step": 15840 }, { "epoch": 0.9148365126547574, "grad_norm": 6.487613210408211, "learning_rate": 8.520464122842464e-07, "loss": 0.3299, "step": 15850 }, { "epoch": 0.9154136965744134, "grad_norm": 3.047470063867609, "learning_rate": 8.462737401142989e-07, "loss": 0.3277, "step": 15860 }, { "epoch": 0.9159908804940694, "grad_norm": 3.3099261534656823, "learning_rate": 8.405010679443515e-07, "loss": 0.3225, "step": 15870 }, { "epoch": 0.9165680644137254, "grad_norm": 12.904829402744845, "learning_rate": 8.34728395774404e-07, "loss": 0.3456, "step": 15880 }, { "epoch": 0.9171452483333814, "grad_norm": 3.874918781355711, "learning_rate": 8.289557236044565e-07, "loss": 0.324, "step": 15890 }, { "epoch": 0.9177224322530374, "grad_norm": 8.856329412411933, "learning_rate": 8.231830514345091e-07, "loss": 0.3336, "step": 15900 }, { "epoch": 0.9182996161726934, "grad_norm": 8.142861706815804, "learning_rate": 8.174103792645616e-07, "loss": 0.3346, "step": 15910 }, { "epoch": 0.9188768000923494, "grad_norm": 6.512456297032329, "learning_rate": 8.116377070946142e-07, "loss": 0.3356, "step": 15920 }, { "epoch": 0.9194539840120054, "grad_norm": 5.20827680094837, "learning_rate": 8.058650349246668e-07, "loss": 0.327, "step": 15930 }, { "epoch": 0.9200311679316614, "grad_norm": 3.6566931406552166, "learning_rate": 8.000923627547193e-07, "loss": 0.3393, "step": 15940 }, { "epoch": 0.9206083518513174, "grad_norm": 5.430549059652793, "learning_rate": 7.943196905847718e-07, "loss": 0.336, "step": 15950 }, { "epoch": 0.9211855357709734, "grad_norm": 11.48641695737308, "learning_rate": 7.885470184148243e-07, "loss": 0.3239, "step": 15960 }, { "epoch": 0.9217627196906294, "grad_norm": 5.74247518929047, "learning_rate": 7.827743462448769e-07, "loss": 0.3272, "step": 15970 }, { "epoch": 0.9223399036102854, "grad_norm": 2.9972434937325954, "learning_rate": 7.770016740749293e-07, "loss": 0.3444, "step": 15980 }, { "epoch": 0.9229170875299414, "grad_norm": 4.3485773330395405, "learning_rate": 7.712290019049818e-07, "loss": 0.3343, "step": 15990 }, { "epoch": 0.9234942714495974, "grad_norm": 5.8420315281490725, "learning_rate": 7.654563297350344e-07, "loss": 0.3418, "step": 16000 }, { "epoch": 0.9240714553692534, "grad_norm": 3.3727101894208924, "learning_rate": 7.596836575650869e-07, "loss": 0.3313, "step": 16010 }, { "epoch": 0.9246486392889094, "grad_norm": 4.3271834892078305, "learning_rate": 7.539109853951394e-07, "loss": 0.3263, "step": 16020 }, { "epoch": 0.9252258232085654, "grad_norm": 4.114539060448778, "learning_rate": 7.481383132251919e-07, "loss": 0.3494, "step": 16030 }, { "epoch": 0.9258030071282214, "grad_norm": 3.6454496106451946, "learning_rate": 7.423656410552445e-07, "loss": 0.3287, "step": 16040 }, { "epoch": 0.9263801910478774, "grad_norm": 3.738978996136776, "learning_rate": 7.365929688852971e-07, "loss": 0.3463, "step": 16050 }, { "epoch": 0.9269573749675334, "grad_norm": 4.789282815458411, "learning_rate": 7.308202967153497e-07, "loss": 0.3237, "step": 16060 }, { "epoch": 0.9275345588871894, "grad_norm": 9.761730219338645, "learning_rate": 7.250476245454022e-07, "loss": 0.3469, "step": 16070 }, { "epoch": 0.9281117428068454, "grad_norm": 3.028208359321862, "learning_rate": 7.192749523754547e-07, "loss": 0.3355, "step": 16080 }, { "epoch": 0.9286889267265014, "grad_norm": 18.45944225954803, "learning_rate": 7.135022802055072e-07, "loss": 0.3177, "step": 16090 }, { "epoch": 0.9292661106461574, "grad_norm": 7.253195406338047, "learning_rate": 7.077296080355598e-07, "loss": 0.3439, "step": 16100 }, { "epoch": 0.9298432945658134, "grad_norm": 4.739735623729803, "learning_rate": 7.019569358656123e-07, "loss": 0.3357, "step": 16110 }, { "epoch": 0.9304204784854694, "grad_norm": 7.262658023729907, "learning_rate": 6.961842636956648e-07, "loss": 0.3307, "step": 16120 }, { "epoch": 0.9309976624051254, "grad_norm": 4.3276336459159275, "learning_rate": 6.904115915257172e-07, "loss": 0.3397, "step": 16130 }, { "epoch": 0.9315748463247814, "grad_norm": 4.773618213493451, "learning_rate": 6.846389193557698e-07, "loss": 0.3265, "step": 16140 }, { "epoch": 0.9321520302444374, "grad_norm": 3.469713298213091, "learning_rate": 6.788662471858223e-07, "loss": 0.3356, "step": 16150 }, { "epoch": 0.9327292141640934, "grad_norm": 2.680891067948385, "learning_rate": 6.730935750158748e-07, "loss": 0.3351, "step": 16160 }, { "epoch": 0.9333063980837494, "grad_norm": 4.795647821005584, "learning_rate": 6.673209028459274e-07, "loss": 0.316, "step": 16170 }, { "epoch": 0.9338835820034054, "grad_norm": 4.072329465945383, "learning_rate": 6.615482306759799e-07, "loss": 0.3323, "step": 16180 }, { "epoch": 0.9344607659230614, "grad_norm": 6.139834777308556, "learning_rate": 6.557755585060325e-07, "loss": 0.3389, "step": 16190 }, { "epoch": 0.9350379498427174, "grad_norm": 6.679020147433282, "learning_rate": 6.500028863360851e-07, "loss": 0.3213, "step": 16200 }, { "epoch": 0.9356151337623734, "grad_norm": 3.931169778597499, "learning_rate": 6.442302141661376e-07, "loss": 0.3202, "step": 16210 }, { "epoch": 0.9361923176820294, "grad_norm": 9.793848920416451, "learning_rate": 6.384575419961901e-07, "loss": 0.3312, "step": 16220 }, { "epoch": 0.9367695016016854, "grad_norm": 3.165076810544466, "learning_rate": 6.326848698262427e-07, "loss": 0.3268, "step": 16230 }, { "epoch": 0.9373466855213414, "grad_norm": 11.613653214904037, "learning_rate": 6.269121976562952e-07, "loss": 0.3395, "step": 16240 }, { "epoch": 0.9379238694409974, "grad_norm": 8.287793663837986, "learning_rate": 6.211395254863477e-07, "loss": 0.337, "step": 16250 }, { "epoch": 0.9385010533606534, "grad_norm": 4.919246593010951, "learning_rate": 6.153668533164002e-07, "loss": 0.3183, "step": 16260 }, { "epoch": 0.9390782372803094, "grad_norm": 12.710584575769794, "learning_rate": 6.095941811464528e-07, "loss": 0.3371, "step": 16270 }, { "epoch": 0.9396554211999654, "grad_norm": 13.642759112617313, "learning_rate": 6.038215089765054e-07, "loss": 0.3364, "step": 16280 }, { "epoch": 0.9402326051196214, "grad_norm": 3.2163528904964567, "learning_rate": 5.980488368065578e-07, "loss": 0.3482, "step": 16290 }, { "epoch": 0.9408097890392774, "grad_norm": 6.3234687340970845, "learning_rate": 5.922761646366103e-07, "loss": 0.3389, "step": 16300 }, { "epoch": 0.9413869729589334, "grad_norm": 4.810430576924547, "learning_rate": 5.865034924666628e-07, "loss": 0.3273, "step": 16310 }, { "epoch": 0.9419641568785894, "grad_norm": 16.667420292327467, "learning_rate": 5.807308202967154e-07, "loss": 0.3333, "step": 16320 }, { "epoch": 0.9425413407982454, "grad_norm": 4.476842181947663, "learning_rate": 5.749581481267679e-07, "loss": 0.3319, "step": 16330 }, { "epoch": 0.9431185247179014, "grad_norm": 2.753605309820116, "learning_rate": 5.691854759568204e-07, "loss": 0.324, "step": 16340 }, { "epoch": 0.9436957086375574, "grad_norm": 7.258998836239923, "learning_rate": 5.63412803786873e-07, "loss": 0.3317, "step": 16350 }, { "epoch": 0.9442728925572134, "grad_norm": 3.3810984203362513, "learning_rate": 5.576401316169255e-07, "loss": 0.3308, "step": 16360 }, { "epoch": 0.9448500764768694, "grad_norm": 10.448881769543355, "learning_rate": 5.518674594469781e-07, "loss": 0.3389, "step": 16370 }, { "epoch": 0.9454272603965254, "grad_norm": 7.485384669846898, "learning_rate": 5.460947872770306e-07, "loss": 0.3225, "step": 16380 }, { "epoch": 0.9460044443161814, "grad_norm": 7.048976390316521, "learning_rate": 5.403221151070831e-07, "loss": 0.3316, "step": 16390 }, { "epoch": 0.9465816282358374, "grad_norm": 5.343786757617583, "learning_rate": 5.345494429371357e-07, "loss": 0.324, "step": 16400 }, { "epoch": 0.9471588121554934, "grad_norm": 8.606885533079547, "learning_rate": 5.287767707671882e-07, "loss": 0.339, "step": 16410 }, { "epoch": 0.9477359960751494, "grad_norm": 5.1476348800105205, "learning_rate": 5.230040985972408e-07, "loss": 0.3316, "step": 16420 }, { "epoch": 0.9483131799948054, "grad_norm": 3.9518622190419386, "learning_rate": 5.172314264272933e-07, "loss": 0.3257, "step": 16430 }, { "epoch": 0.9488903639144614, "grad_norm": 4.759115152912856, "learning_rate": 5.114587542573457e-07, "loss": 0.3252, "step": 16440 }, { "epoch": 0.9494675478341174, "grad_norm": 2.3577377378728936, "learning_rate": 5.056860820873983e-07, "loss": 0.3354, "step": 16450 }, { "epoch": 0.9500447317537734, "grad_norm": 8.233491658698778, "learning_rate": 4.999134099174508e-07, "loss": 0.3269, "step": 16460 }, { "epoch": 0.9506219156734294, "grad_norm": 5.830593517325124, "learning_rate": 4.941407377475033e-07, "loss": 0.3303, "step": 16470 }, { "epoch": 0.9511990995930854, "grad_norm": 4.5976658225857205, "learning_rate": 4.883680655775559e-07, "loss": 0.3164, "step": 16480 }, { "epoch": 0.9517762835127414, "grad_norm": 2.683143832655395, "learning_rate": 4.825953934076084e-07, "loss": 0.3389, "step": 16490 }, { "epoch": 0.9523534674323973, "grad_norm": 6.256568384332184, "learning_rate": 4.7682272123766096e-07, "loss": 0.3379, "step": 16500 }, { "epoch": 0.9529306513520533, "grad_norm": 5.947037852710701, "learning_rate": 4.710500490677135e-07, "loss": 0.3336, "step": 16510 }, { "epoch": 0.9535078352717093, "grad_norm": 3.9206339656766183, "learning_rate": 4.65277376897766e-07, "loss": 0.3416, "step": 16520 }, { "epoch": 0.9540850191913653, "grad_norm": 5.305934878449426, "learning_rate": 4.5950470472781854e-07, "loss": 0.343, "step": 16530 }, { "epoch": 0.9546622031110213, "grad_norm": 4.905648955862364, "learning_rate": 4.53732032557871e-07, "loss": 0.3326, "step": 16540 }, { "epoch": 0.9552393870306773, "grad_norm": 6.934144679851784, "learning_rate": 4.479593603879236e-07, "loss": 0.3315, "step": 16550 }, { "epoch": 0.9558165709503333, "grad_norm": 6.121333752853476, "learning_rate": 4.4218668821797617e-07, "loss": 0.3337, "step": 16560 }, { "epoch": 0.9563937548699893, "grad_norm": 4.161869077945622, "learning_rate": 4.3641401604802866e-07, "loss": 0.354, "step": 16570 }, { "epoch": 0.9569709387896453, "grad_norm": 4.792938959925312, "learning_rate": 4.306413438780812e-07, "loss": 0.3385, "step": 16580 }, { "epoch": 0.9575481227093013, "grad_norm": 13.85786954380734, "learning_rate": 4.248686717081337e-07, "loss": 0.3206, "step": 16590 }, { "epoch": 0.9581253066289573, "grad_norm": 21.263443082950594, "learning_rate": 4.1909599953818624e-07, "loss": 0.3325, "step": 16600 }, { "epoch": 0.9587024905486133, "grad_norm": 3.660403999109124, "learning_rate": 4.1332332736823884e-07, "loss": 0.3227, "step": 16610 }, { "epoch": 0.9592796744682693, "grad_norm": 3.9235176913649994, "learning_rate": 4.075506551982913e-07, "loss": 0.3283, "step": 16620 }, { "epoch": 0.9598568583879253, "grad_norm": 5.6449372673837965, "learning_rate": 4.0177798302834387e-07, "loss": 0.3427, "step": 16630 }, { "epoch": 0.9604340423075813, "grad_norm": 5.248416354277083, "learning_rate": 3.9600531085839636e-07, "loss": 0.3288, "step": 16640 }, { "epoch": 0.9610112262272373, "grad_norm": 8.246345220378487, "learning_rate": 3.902326386884489e-07, "loss": 0.3354, "step": 16650 }, { "epoch": 0.9615884101468933, "grad_norm": 13.274950590494653, "learning_rate": 3.8445996651850145e-07, "loss": 0.3312, "step": 16660 }, { "epoch": 0.9621655940665493, "grad_norm": 15.537361667999631, "learning_rate": 3.7868729434855394e-07, "loss": 0.3237, "step": 16670 }, { "epoch": 0.9627427779862053, "grad_norm": 17.30099668558429, "learning_rate": 3.7291462217860654e-07, "loss": 0.3447, "step": 16680 }, { "epoch": 0.9633199619058613, "grad_norm": 6.00682814280853, "learning_rate": 3.671419500086591e-07, "loss": 0.3285, "step": 16690 }, { "epoch": 0.9638971458255173, "grad_norm": 4.337349097771177, "learning_rate": 3.613692778387116e-07, "loss": 0.3406, "step": 16700 }, { "epoch": 0.9644743297451733, "grad_norm": 5.112346889090425, "learning_rate": 3.555966056687641e-07, "loss": 0.3245, "step": 16710 }, { "epoch": 0.9650515136648293, "grad_norm": 3.333315383479396, "learning_rate": 3.498239334988166e-07, "loss": 0.323, "step": 16720 }, { "epoch": 0.9656286975844853, "grad_norm": 6.363838641104665, "learning_rate": 3.4405126132886915e-07, "loss": 0.3335, "step": 16730 }, { "epoch": 0.9662058815041413, "grad_norm": 4.4727853159969095, "learning_rate": 3.3827858915892164e-07, "loss": 0.3195, "step": 16740 }, { "epoch": 0.9667830654237973, "grad_norm": 2.9245486184268525, "learning_rate": 3.3250591698897424e-07, "loss": 0.334, "step": 16750 }, { "epoch": 0.9673602493434533, "grad_norm": 8.292114221205217, "learning_rate": 3.267332448190268e-07, "loss": 0.3417, "step": 16760 }, { "epoch": 0.9679374332631093, "grad_norm": 4.467404105277962, "learning_rate": 3.209605726490793e-07, "loss": 0.3273, "step": 16770 }, { "epoch": 0.9685146171827653, "grad_norm": 3.555136063724782, "learning_rate": 3.151879004791318e-07, "loss": 0.3319, "step": 16780 }, { "epoch": 0.9690918011024213, "grad_norm": 3.5864439964386206, "learning_rate": 3.0941522830918436e-07, "loss": 0.3151, "step": 16790 }, { "epoch": 0.9696689850220773, "grad_norm": 4.286014953806982, "learning_rate": 3.036425561392369e-07, "loss": 0.3348, "step": 16800 }, { "epoch": 0.9702461689417333, "grad_norm": 4.937397445129751, "learning_rate": 2.978698839692894e-07, "loss": 0.3447, "step": 16810 }, { "epoch": 0.9708233528613893, "grad_norm": 4.053983936820117, "learning_rate": 2.9209721179934194e-07, "loss": 0.3234, "step": 16820 }, { "epoch": 0.9714005367810453, "grad_norm": 4.668613443745286, "learning_rate": 2.863245396293945e-07, "loss": 0.3318, "step": 16830 }, { "epoch": 0.9719777207007013, "grad_norm": 4.726308017445137, "learning_rate": 2.80551867459447e-07, "loss": 0.3384, "step": 16840 }, { "epoch": 0.9725549046203573, "grad_norm": 5.20234640635383, "learning_rate": 2.747791952894995e-07, "loss": 0.3414, "step": 16850 }, { "epoch": 0.9731320885400133, "grad_norm": 5.471268202808402, "learning_rate": 2.6900652311955207e-07, "loss": 0.3158, "step": 16860 }, { "epoch": 0.9737092724596693, "grad_norm": 4.041775287210815, "learning_rate": 2.632338509496046e-07, "loss": 0.3353, "step": 16870 }, { "epoch": 0.9742864563793253, "grad_norm": 6.9340722075810515, "learning_rate": 2.574611787796571e-07, "loss": 0.3292, "step": 16880 }, { "epoch": 0.9748636402989813, "grad_norm": 5.462231359128078, "learning_rate": 2.5168850660970965e-07, "loss": 0.3382, "step": 16890 }, { "epoch": 0.9754408242186373, "grad_norm": 5.016835747194534, "learning_rate": 2.459158344397622e-07, "loss": 0.3264, "step": 16900 }, { "epoch": 0.9760180081382933, "grad_norm": 6.59783102359862, "learning_rate": 2.4014316226981474e-07, "loss": 0.3303, "step": 16910 }, { "epoch": 0.9765951920579493, "grad_norm": 11.129353025607179, "learning_rate": 2.3437049009986723e-07, "loss": 0.3196, "step": 16920 }, { "epoch": 0.9771723759776053, "grad_norm": 5.828386789897742, "learning_rate": 2.285978179299198e-07, "loss": 0.3319, "step": 16930 }, { "epoch": 0.9777495598972613, "grad_norm": 2.710691717608737, "learning_rate": 2.2282514575997232e-07, "loss": 0.3319, "step": 16940 }, { "epoch": 0.9783267438169173, "grad_norm": 5.1520597373996715, "learning_rate": 2.1705247359002483e-07, "loss": 0.3314, "step": 16950 }, { "epoch": 0.9789039277365733, "grad_norm": 3.185806720570308, "learning_rate": 2.1127980142007738e-07, "loss": 0.3296, "step": 16960 }, { "epoch": 0.9794811116562293, "grad_norm": 6.515634970555692, "learning_rate": 2.055071292501299e-07, "loss": 0.3312, "step": 16970 }, { "epoch": 0.9800582955758853, "grad_norm": 10.485461655446002, "learning_rate": 1.997344570801824e-07, "loss": 0.3422, "step": 16980 }, { "epoch": 0.9806354794955413, "grad_norm": 3.8847690688300727, "learning_rate": 1.9396178491023498e-07, "loss": 0.3283, "step": 16990 }, { "epoch": 0.9812126634151973, "grad_norm": 9.994920110996672, "learning_rate": 1.881891127402875e-07, "loss": 0.3327, "step": 17000 }, { "epoch": 0.9817898473348533, "grad_norm": 5.025058096183087, "learning_rate": 1.8241644057034002e-07, "loss": 0.3207, "step": 17010 }, { "epoch": 0.9823670312545093, "grad_norm": 11.136774459380124, "learning_rate": 1.7664376840039256e-07, "loss": 0.3233, "step": 17020 }, { "epoch": 0.9829442151741652, "grad_norm": 2.906968928719714, "learning_rate": 1.708710962304451e-07, "loss": 0.335, "step": 17030 }, { "epoch": 0.9835213990938212, "grad_norm": 47.22379036340986, "learning_rate": 1.6509842406049762e-07, "loss": 0.326, "step": 17040 }, { "epoch": 0.9840985830134772, "grad_norm": 3.5068334227447537, "learning_rate": 1.5932575189055014e-07, "loss": 0.3308, "step": 17050 }, { "epoch": 0.9846757669331332, "grad_norm": 6.058207244664307, "learning_rate": 1.5355307972060266e-07, "loss": 0.336, "step": 17060 }, { "epoch": 0.9852529508527892, "grad_norm": 14.19694786309551, "learning_rate": 1.477804075506552e-07, "loss": 0.3482, "step": 17070 }, { "epoch": 0.9858301347724452, "grad_norm": 4.351680537742745, "learning_rate": 1.4200773538070775e-07, "loss": 0.3533, "step": 17080 }, { "epoch": 0.9864073186921012, "grad_norm": 11.029248449585278, "learning_rate": 1.3623506321076027e-07, "loss": 0.3196, "step": 17090 }, { "epoch": 0.9869845026117572, "grad_norm": 5.081238129188481, "learning_rate": 1.304623910408128e-07, "loss": 0.333, "step": 17100 }, { "epoch": 0.9875616865314132, "grad_norm": 23.011375672208313, "learning_rate": 1.2468971887086533e-07, "loss": 0.3353, "step": 17110 }, { "epoch": 0.9881388704510692, "grad_norm": 3.615170936112003, "learning_rate": 1.1891704670091786e-07, "loss": 0.3293, "step": 17120 }, { "epoch": 0.9887160543707252, "grad_norm": 6.0241909835288645, "learning_rate": 1.131443745309704e-07, "loss": 0.3305, "step": 17130 }, { "epoch": 0.9892932382903812, "grad_norm": 6.4242440293309, "learning_rate": 1.0737170236102292e-07, "loss": 0.3229, "step": 17140 }, { "epoch": 0.9898704222100372, "grad_norm": 4.8207248692315465, "learning_rate": 1.0159903019107546e-07, "loss": 0.3317, "step": 17150 }, { "epoch": 0.9904476061296932, "grad_norm": 4.674342685671797, "learning_rate": 9.5826358021128e-08, "loss": 0.328, "step": 17160 }, { "epoch": 0.9910247900493492, "grad_norm": 5.524320830604144, "learning_rate": 9.005368585118051e-08, "loss": 0.3297, "step": 17170 }, { "epoch": 0.9916019739690052, "grad_norm": 6.1310872624369175, "learning_rate": 8.428101368123306e-08, "loss": 0.3342, "step": 17180 }, { "epoch": 0.9921791578886612, "grad_norm": 4.736837397124582, "learning_rate": 7.850834151128557e-08, "loss": 0.3261, "step": 17190 }, { "epoch": 0.9927563418083172, "grad_norm": 3.3135028507038498, "learning_rate": 7.273566934133812e-08, "loss": 0.3125, "step": 17200 }, { "epoch": 0.9933335257279732, "grad_norm": 4.80914101916905, "learning_rate": 6.696299717139064e-08, "loss": 0.3233, "step": 17210 }, { "epoch": 0.9939107096476292, "grad_norm": 4.178676432109751, "learning_rate": 6.119032500144317e-08, "loss": 0.316, "step": 17220 }, { "epoch": 0.9944878935672852, "grad_norm": 4.648278510632473, "learning_rate": 5.5417652831495705e-08, "loss": 0.3247, "step": 17230 }, { "epoch": 0.9950650774869412, "grad_norm": 6.4221153929916515, "learning_rate": 4.9644980661548236e-08, "loss": 0.3252, "step": 17240 }, { "epoch": 0.9956422614065972, "grad_norm": 4.57910460292981, "learning_rate": 4.3872308491600766e-08, "loss": 0.346, "step": 17250 }, { "epoch": 0.9962194453262532, "grad_norm": 6.013404774431674, "learning_rate": 3.80996363216533e-08, "loss": 0.3286, "step": 17260 }, { "epoch": 0.9967966292459092, "grad_norm": 4.660240371593313, "learning_rate": 3.232696415170583e-08, "loss": 0.3238, "step": 17270 }, { "epoch": 0.9973738131655652, "grad_norm": 3.0170050218995734, "learning_rate": 2.6554291981758356e-08, "loss": 0.3399, "step": 17280 }, { "epoch": 0.9979509970852212, "grad_norm": 3.235155684245671, "learning_rate": 2.078161981181089e-08, "loss": 0.343, "step": 17290 }, { "epoch": 0.9985281810048772, "grad_norm": 3.668276198786728, "learning_rate": 1.500894764186342e-08, "loss": 0.3299, "step": 17300 }, { "epoch": 0.9991053649245332, "grad_norm": 5.359004024139474, "learning_rate": 9.23627547191595e-09, "loss": 0.3202, "step": 17310 }, { "epoch": 0.9996825488441892, "grad_norm": 5.70686001442703, "learning_rate": 3.4636033019684815e-09, "loss": 0.3161, "step": 17320 } ], "logging_steps": 10, "max_steps": 17325, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9935909437571072.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }