diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7138 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7499861462585662, + "eval_steps": 500, + "global_step": 20301, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007388662098010602, + "grad_norm": 12.062633265062304, + "learning_rate": 4.999999984845559e-07, + "loss": 1.9234, + "step": 20 + }, + { + "epoch": 0.0014777324196021205, + "grad_norm": 7.366383713647709, + "learning_rate": 4.999993938226169e-07, + "loss": 1.8338, + "step": 40 + }, + { + "epoch": 0.002216598629403181, + "grad_norm": 1.7977538453913595, + "learning_rate": 4.999975752937336e-07, + "loss": 1.7526, + "step": 60 + }, + { + "epoch": 0.002955464839204241, + "grad_norm": 1.6391792506527756, + "learning_rate": 4.999945444231491e-07, + "loss": 1.7305, + "step": 80 + }, + { + "epoch": 0.0036943310490053015, + "grad_norm": 2.046829887141379, + "learning_rate": 4.999903012271942e-07, + "loss": 1.741, + "step": 100 + }, + { + "epoch": 0.004433197258806362, + "grad_norm": 1.8506603870837903, + "learning_rate": 4.999848457287324e-07, + "loss": 1.7129, + "step": 120 + }, + { + "epoch": 0.005172063468607422, + "grad_norm": 1.532512758048568, + "learning_rate": 4.999781779571592e-07, + "loss": 1.6774, + "step": 140 + }, + { + "epoch": 0.005910929678408482, + "grad_norm": 1.5744798799071549, + "learning_rate": 4.999702979484023e-07, + "loss": 1.7007, + "step": 160 + }, + { + "epoch": 0.006649795888209542, + "grad_norm": 3.3220978847027203, + "learning_rate": 4.999612057449209e-07, + "loss": 1.713, + "step": 180 + }, + { + "epoch": 0.007388662098010603, + "grad_norm": 1.6192232490789547, + "learning_rate": 4.999509013957061e-07, + "loss": 1.7085, + "step": 200 + }, + { + "epoch": 0.008127528307811663, + "grad_norm": 1.6102605979486695, + "learning_rate": 4.999393849562803e-07, + "loss": 1.6909, + "step": 220 + }, + { + "epoch": 0.008866394517612723, + "grad_norm": 1.7856392499507323, + "learning_rate": 4.999266564886968e-07, + "loss": 1.7105, + "step": 240 + }, + { + "epoch": 0.009605260727413783, + "grad_norm": 1.462292806282488, + "learning_rate": 4.999127160615396e-07, + "loss": 1.7254, + "step": 260 + }, + { + "epoch": 0.010344126937214844, + "grad_norm": 1.6210572507919945, + "learning_rate": 4.998975637499234e-07, + "loss": 1.7228, + "step": 280 + }, + { + "epoch": 0.011082993147015904, + "grad_norm": 1.5952635257667038, + "learning_rate": 4.998811996354924e-07, + "loss": 1.747, + "step": 300 + }, + { + "epoch": 0.011821859356816964, + "grad_norm": 1.7849552493640788, + "learning_rate": 4.998636238064202e-07, + "loss": 1.6851, + "step": 320 + }, + { + "epoch": 0.012560725566618025, + "grad_norm": 1.7748576256781579, + "learning_rate": 4.9984483635741e-07, + "loss": 1.7215, + "step": 340 + }, + { + "epoch": 0.013299591776419085, + "grad_norm": 1.5217631531152356, + "learning_rate": 4.998248373896929e-07, + "loss": 1.7062, + "step": 360 + }, + { + "epoch": 0.014038457986220144, + "grad_norm": 1.8070743437219547, + "learning_rate": 4.998036270110284e-07, + "loss": 1.7108, + "step": 380 + }, + { + "epoch": 0.014777324196021206, + "grad_norm": 1.931037650128842, + "learning_rate": 4.997812053357031e-07, + "loss": 1.6739, + "step": 400 + }, + { + "epoch": 0.015516190405822266, + "grad_norm": 2.3425787137976073, + "learning_rate": 4.997575724845303e-07, + "loss": 1.6501, + "step": 420 + }, + { + "epoch": 0.016255056615623327, + "grad_norm": 1.5819249160473718, + "learning_rate": 4.997327285848497e-07, + "loss": 1.7295, + "step": 440 + }, + { + "epoch": 0.016993922825424387, + "grad_norm": 1.780767344095751, + "learning_rate": 4.997066737705263e-07, + "loss": 1.7035, + "step": 460 + }, + { + "epoch": 0.017732789035225446, + "grad_norm": 1.502517772930168, + "learning_rate": 4.996794081819497e-07, + "loss": 1.72, + "step": 480 + }, + { + "epoch": 0.018471655245026506, + "grad_norm": 1.6230104328728192, + "learning_rate": 4.996509319660336e-07, + "loss": 1.7052, + "step": 500 + }, + { + "epoch": 0.019210521454827566, + "grad_norm": 2.5063386321134287, + "learning_rate": 4.996212452762147e-07, + "loss": 1.7111, + "step": 520 + }, + { + "epoch": 0.01994938766462863, + "grad_norm": 1.508194569170525, + "learning_rate": 4.995903482724523e-07, + "loss": 1.7116, + "step": 540 + }, + { + "epoch": 0.02068825387442969, + "grad_norm": 1.5662589803980058, + "learning_rate": 4.995582411212267e-07, + "loss": 1.6586, + "step": 560 + }, + { + "epoch": 0.021427120084230748, + "grad_norm": 1.5479171922961865, + "learning_rate": 4.995249239955392e-07, + "loss": 1.6605, + "step": 580 + }, + { + "epoch": 0.022165986294031808, + "grad_norm": 1.4441787150001577, + "learning_rate": 4.994903970749107e-07, + "loss": 1.6952, + "step": 600 + }, + { + "epoch": 0.022904852503832868, + "grad_norm": 1.8034952536565763, + "learning_rate": 4.994546605453804e-07, + "loss": 1.6928, + "step": 620 + }, + { + "epoch": 0.023643718713633927, + "grad_norm": 2.0505792045813123, + "learning_rate": 4.994177145995056e-07, + "loss": 1.6979, + "step": 640 + }, + { + "epoch": 0.02438258492343499, + "grad_norm": 1.7345451000474756, + "learning_rate": 4.993795594363599e-07, + "loss": 1.6827, + "step": 660 + }, + { + "epoch": 0.02512145113323605, + "grad_norm": 1.7265664949693813, + "learning_rate": 4.993401952615327e-07, + "loss": 1.6949, + "step": 680 + }, + { + "epoch": 0.02586031734303711, + "grad_norm": 1.5684431888117931, + "learning_rate": 4.992996222871278e-07, + "loss": 1.6725, + "step": 700 + }, + { + "epoch": 0.02659918355283817, + "grad_norm": 1.8458741005435486, + "learning_rate": 4.992578407317622e-07, + "loss": 1.6876, + "step": 720 + }, + { + "epoch": 0.02733804976263923, + "grad_norm": 1.603183220486937, + "learning_rate": 4.992148508205652e-07, + "loss": 1.7001, + "step": 740 + }, + { + "epoch": 0.02807691597244029, + "grad_norm": 1.4656870216667528, + "learning_rate": 4.991706527851766e-07, + "loss": 1.6743, + "step": 760 + }, + { + "epoch": 0.028815782182241352, + "grad_norm": 1.79341933069724, + "learning_rate": 4.991252468637465e-07, + "loss": 1.6894, + "step": 780 + }, + { + "epoch": 0.029554648392042412, + "grad_norm": 1.4496770314789245, + "learning_rate": 4.990786333009329e-07, + "loss": 1.7038, + "step": 800 + }, + { + "epoch": 0.03029351460184347, + "grad_norm": 1.757004570982493, + "learning_rate": 4.990308123479012e-07, + "loss": 1.7134, + "step": 820 + }, + { + "epoch": 0.03103238081164453, + "grad_norm": 1.5200364379437228, + "learning_rate": 4.98981784262322e-07, + "loss": 1.6698, + "step": 840 + }, + { + "epoch": 0.03177124702144559, + "grad_norm": 1.486100216095798, + "learning_rate": 4.989315493083708e-07, + "loss": 1.6896, + "step": 860 + }, + { + "epoch": 0.032510113231246654, + "grad_norm": 1.6006604995588511, + "learning_rate": 4.988801077567258e-07, + "loss": 1.6842, + "step": 880 + }, + { + "epoch": 0.03324897944104771, + "grad_norm": 1.6369118826080298, + "learning_rate": 4.988274598845665e-07, + "loss": 1.7129, + "step": 900 + }, + { + "epoch": 0.03398784565084877, + "grad_norm": 1.594714153238538, + "learning_rate": 4.987736059755724e-07, + "loss": 1.6812, + "step": 920 + }, + { + "epoch": 0.03472671186064983, + "grad_norm": 1.691349253313934, + "learning_rate": 4.987185463199215e-07, + "loss": 1.7131, + "step": 940 + }, + { + "epoch": 0.03546557807045089, + "grad_norm": 2.1466962990805385, + "learning_rate": 4.986622812142888e-07, + "loss": 1.7217, + "step": 960 + }, + { + "epoch": 0.036204444280251956, + "grad_norm": 1.6551865204895997, + "learning_rate": 4.986048109618442e-07, + "loss": 1.7179, + "step": 980 + }, + { + "epoch": 0.03694331049005301, + "grad_norm": 1.5681769699914139, + "learning_rate": 4.985461358722514e-07, + "loss": 1.6897, + "step": 1000 + }, + { + "epoch": 0.037682176699854075, + "grad_norm": 1.5623589901869384, + "learning_rate": 4.984862562616661e-07, + "loss": 1.7307, + "step": 1020 + }, + { + "epoch": 0.03842104290965513, + "grad_norm": 1.710638101923504, + "learning_rate": 4.984251724527342e-07, + "loss": 1.6815, + "step": 1040 + }, + { + "epoch": 0.039159909119456195, + "grad_norm": 2.613860202511964, + "learning_rate": 4.983628847745904e-07, + "loss": 1.6798, + "step": 1060 + }, + { + "epoch": 0.03989877532925726, + "grad_norm": 1.9956576024499864, + "learning_rate": 4.982993935628554e-07, + "loss": 1.6715, + "step": 1080 + }, + { + "epoch": 0.040637641539058314, + "grad_norm": 1.833965747583207, + "learning_rate": 4.982346991596356e-07, + "loss": 1.7044, + "step": 1100 + }, + { + "epoch": 0.04137650774885938, + "grad_norm": 1.709065067682895, + "learning_rate": 4.981688019135202e-07, + "loss": 1.6612, + "step": 1120 + }, + { + "epoch": 0.04211537395866043, + "grad_norm": 2.2904499102757767, + "learning_rate": 4.981017021795794e-07, + "loss": 1.6984, + "step": 1140 + }, + { + "epoch": 0.042854240168461497, + "grad_norm": 1.5809892282131641, + "learning_rate": 4.980334003193632e-07, + "loss": 1.672, + "step": 1160 + }, + { + "epoch": 0.04359310637826255, + "grad_norm": 1.4895408854624943, + "learning_rate": 4.979638967008983e-07, + "loss": 1.6637, + "step": 1180 + }, + { + "epoch": 0.044331972588063616, + "grad_norm": 1.6294048072820626, + "learning_rate": 4.978931916986874e-07, + "loss": 1.6604, + "step": 1200 + }, + { + "epoch": 0.04507083879786468, + "grad_norm": 2.156396711607377, + "learning_rate": 4.978212856937062e-07, + "loss": 1.678, + "step": 1220 + }, + { + "epoch": 0.045809705007665735, + "grad_norm": 1.6010232675443634, + "learning_rate": 4.977481790734016e-07, + "loss": 1.6922, + "step": 1240 + }, + { + "epoch": 0.0465485712174668, + "grad_norm": 1.4024504403885678, + "learning_rate": 4.9767387223169e-07, + "loss": 1.6538, + "step": 1260 + }, + { + "epoch": 0.047287437427267855, + "grad_norm": 1.711902948101267, + "learning_rate": 4.975983655689547e-07, + "loss": 1.6844, + "step": 1280 + }, + { + "epoch": 0.04802630363706892, + "grad_norm": 1.572788133497536, + "learning_rate": 4.975216594920441e-07, + "loss": 1.6773, + "step": 1300 + }, + { + "epoch": 0.04876516984686998, + "grad_norm": 1.5865129712420638, + "learning_rate": 4.974437544142691e-07, + "loss": 1.6794, + "step": 1320 + }, + { + "epoch": 0.04950403605667104, + "grad_norm": 1.5690014017001472, + "learning_rate": 4.973646507554012e-07, + "loss": 1.7073, + "step": 1340 + }, + { + "epoch": 0.0502429022664721, + "grad_norm": 1.5937164094810738, + "learning_rate": 4.972843489416702e-07, + "loss": 1.6958, + "step": 1360 + }, + { + "epoch": 0.05098176847627316, + "grad_norm": 1.5264901796499448, + "learning_rate": 4.972028494057619e-07, + "loss": 1.6452, + "step": 1380 + }, + { + "epoch": 0.05172063468607422, + "grad_norm": 1.7268277514753942, + "learning_rate": 4.971201525868155e-07, + "loss": 1.6944, + "step": 1400 + }, + { + "epoch": 0.052459500895875276, + "grad_norm": 1.6746090286211905, + "learning_rate": 4.970362589304216e-07, + "loss": 1.6621, + "step": 1420 + }, + { + "epoch": 0.05319836710567634, + "grad_norm": 1.5009694467436718, + "learning_rate": 4.969511688886198e-07, + "loss": 1.6797, + "step": 1440 + }, + { + "epoch": 0.0539372333154774, + "grad_norm": 1.4662106712988012, + "learning_rate": 4.968648829198958e-07, + "loss": 1.6664, + "step": 1460 + }, + { + "epoch": 0.05467609952527846, + "grad_norm": 1.5749221565087543, + "learning_rate": 4.967774014891796e-07, + "loss": 1.7086, + "step": 1480 + }, + { + "epoch": 0.05541496573507952, + "grad_norm": 1.482093097261866, + "learning_rate": 4.966887250678421e-07, + "loss": 1.7089, + "step": 1500 + }, + { + "epoch": 0.05615383194488058, + "grad_norm": 1.520435320135513, + "learning_rate": 4.965988541336936e-07, + "loss": 1.6734, + "step": 1520 + }, + { + "epoch": 0.05689269815468164, + "grad_norm": 1.5553430296104012, + "learning_rate": 4.965077891709807e-07, + "loss": 1.697, + "step": 1540 + }, + { + "epoch": 0.057631564364482704, + "grad_norm": 1.3543792401342896, + "learning_rate": 4.964155306703835e-07, + "loss": 1.6997, + "step": 1560 + }, + { + "epoch": 0.05837043057428376, + "grad_norm": 1.575031153656866, + "learning_rate": 4.963220791290132e-07, + "loss": 1.6916, + "step": 1580 + }, + { + "epoch": 0.059109296784084824, + "grad_norm": 1.669401673230416, + "learning_rate": 4.962274350504096e-07, + "loss": 1.7042, + "step": 1600 + }, + { + "epoch": 0.05984816299388588, + "grad_norm": 1.5212881661869584, + "learning_rate": 4.961315989445378e-07, + "loss": 1.667, + "step": 1620 + }, + { + "epoch": 0.06058702920368694, + "grad_norm": 1.5762244565376538, + "learning_rate": 4.960345713277863e-07, + "loss": 1.6342, + "step": 1640 + }, + { + "epoch": 0.061325895413488006, + "grad_norm": 1.5691787785228513, + "learning_rate": 4.959363527229634e-07, + "loss": 1.6525, + "step": 1660 + }, + { + "epoch": 0.06206476162328906, + "grad_norm": 1.6437209138688083, + "learning_rate": 4.958369436592948e-07, + "loss": 1.6769, + "step": 1680 + }, + { + "epoch": 0.06280362783309013, + "grad_norm": 1.553888728962609, + "learning_rate": 4.957363446724208e-07, + "loss": 1.6924, + "step": 1700 + }, + { + "epoch": 0.06354249404289118, + "grad_norm": 1.5365170274491486, + "learning_rate": 4.956345563043933e-07, + "loss": 1.6894, + "step": 1720 + }, + { + "epoch": 0.06428136025269224, + "grad_norm": 1.5197453947387185, + "learning_rate": 4.955315791036727e-07, + "loss": 1.6758, + "step": 1740 + }, + { + "epoch": 0.06502022646249331, + "grad_norm": 2.464391338240643, + "learning_rate": 4.954274136251251e-07, + "loss": 1.6332, + "step": 1760 + }, + { + "epoch": 0.06575909267229436, + "grad_norm": 1.6394501426047832, + "learning_rate": 4.953220604300198e-07, + "loss": 1.6879, + "step": 1780 + }, + { + "epoch": 0.06649795888209542, + "grad_norm": 1.6810218422818062, + "learning_rate": 4.952155200860251e-07, + "loss": 1.6724, + "step": 1800 + }, + { + "epoch": 0.06723682509189649, + "grad_norm": 1.6917107156739108, + "learning_rate": 4.951077931672067e-07, + "loss": 1.6826, + "step": 1820 + }, + { + "epoch": 0.06797569130169755, + "grad_norm": 4.580444011220939, + "learning_rate": 4.949988802540229e-07, + "loss": 1.6581, + "step": 1840 + }, + { + "epoch": 0.0687145575114986, + "grad_norm": 1.529682024928737, + "learning_rate": 4.948887819333236e-07, + "loss": 1.6649, + "step": 1860 + }, + { + "epoch": 0.06945342372129966, + "grad_norm": 1.4414816212505979, + "learning_rate": 4.947774987983449e-07, + "loss": 1.6691, + "step": 1880 + }, + { + "epoch": 0.07019228993110073, + "grad_norm": 1.9799877388187868, + "learning_rate": 4.946650314487077e-07, + "loss": 1.6937, + "step": 1900 + }, + { + "epoch": 0.07093115614090179, + "grad_norm": 1.4952225950122013, + "learning_rate": 4.945513804904131e-07, + "loss": 1.6798, + "step": 1920 + }, + { + "epoch": 0.07167002235070284, + "grad_norm": 1.5642244850204086, + "learning_rate": 4.944365465358407e-07, + "loss": 1.6783, + "step": 1940 + }, + { + "epoch": 0.07240888856050391, + "grad_norm": 1.5048141774546024, + "learning_rate": 4.943205302037432e-07, + "loss": 1.6486, + "step": 1960 + }, + { + "epoch": 0.07314775477030497, + "grad_norm": 1.4222420311354336, + "learning_rate": 4.942033321192452e-07, + "loss": 1.6868, + "step": 1980 + }, + { + "epoch": 0.07388662098010602, + "grad_norm": 1.6893784971157513, + "learning_rate": 4.940849529138383e-07, + "loss": 1.6934, + "step": 2000 + }, + { + "epoch": 0.0746254871899071, + "grad_norm": 1.9605139373755291, + "learning_rate": 4.939653932253786e-07, + "loss": 1.6537, + "step": 2020 + }, + { + "epoch": 0.07536435339970815, + "grad_norm": 1.6497175696745814, + "learning_rate": 4.938446536980829e-07, + "loss": 1.7022, + "step": 2040 + }, + { + "epoch": 0.0761032196095092, + "grad_norm": 1.6258237906038047, + "learning_rate": 4.93722734982525e-07, + "loss": 1.6925, + "step": 2060 + }, + { + "epoch": 0.07684208581931026, + "grad_norm": 1.5236446480879742, + "learning_rate": 4.935996377356326e-07, + "loss": 1.6418, + "step": 2080 + }, + { + "epoch": 0.07758095202911133, + "grad_norm": 1.5958168322212294, + "learning_rate": 4.934753626206837e-07, + "loss": 1.7259, + "step": 2100 + }, + { + "epoch": 0.07831981823891239, + "grad_norm": 2.45551144657574, + "learning_rate": 4.933499103073029e-07, + "loss": 1.7141, + "step": 2120 + }, + { + "epoch": 0.07905868444871345, + "grad_norm": 1.519298595383626, + "learning_rate": 4.932232814714576e-07, + "loss": 1.6712, + "step": 2140 + }, + { + "epoch": 0.07979755065851452, + "grad_norm": 1.6278461520508305, + "learning_rate": 4.930954767954551e-07, + "loss": 1.6784, + "step": 2160 + }, + { + "epoch": 0.08053641686831557, + "grad_norm": 1.5199514109499472, + "learning_rate": 4.92966496967938e-07, + "loss": 1.6842, + "step": 2180 + }, + { + "epoch": 0.08127528307811663, + "grad_norm": 1.8268617588637115, + "learning_rate": 4.928363426838808e-07, + "loss": 1.714, + "step": 2200 + }, + { + "epoch": 0.08201414928791768, + "grad_norm": 1.671569089879459, + "learning_rate": 4.927050146445867e-07, + "loss": 1.6693, + "step": 2220 + }, + { + "epoch": 0.08275301549771875, + "grad_norm": 1.4546842764348067, + "learning_rate": 4.92572513557683e-07, + "loss": 1.6724, + "step": 2240 + }, + { + "epoch": 0.08349188170751981, + "grad_norm": 1.5602541654624753, + "learning_rate": 4.924388401371179e-07, + "loss": 1.6715, + "step": 2260 + }, + { + "epoch": 0.08423074791732087, + "grad_norm": 1.6408350929881408, + "learning_rate": 4.923039951031562e-07, + "loss": 1.6538, + "step": 2280 + }, + { + "epoch": 0.08496961412712194, + "grad_norm": 1.3547990923859226, + "learning_rate": 4.921679791823761e-07, + "loss": 1.6639, + "step": 2300 + }, + { + "epoch": 0.08570848033692299, + "grad_norm": 1.536988279196407, + "learning_rate": 4.92030793107664e-07, + "loss": 1.6709, + "step": 2320 + }, + { + "epoch": 0.08644734654672405, + "grad_norm": 1.4484585256339257, + "learning_rate": 4.918924376182121e-07, + "loss": 1.6517, + "step": 2340 + }, + { + "epoch": 0.0871862127565251, + "grad_norm": 1.4965077209050879, + "learning_rate": 4.917529134595135e-07, + "loss": 1.6956, + "step": 2360 + }, + { + "epoch": 0.08792507896632618, + "grad_norm": 1.858590784934109, + "learning_rate": 4.916122213833584e-07, + "loss": 1.6667, + "step": 2380 + }, + { + "epoch": 0.08866394517612723, + "grad_norm": 1.6845854426029852, + "learning_rate": 4.914703621478297e-07, + "loss": 1.6392, + "step": 2400 + }, + { + "epoch": 0.08940281138592829, + "grad_norm": 1.479499322660105, + "learning_rate": 4.913273365172998e-07, + "loss": 1.6323, + "step": 2420 + }, + { + "epoch": 0.09014167759572936, + "grad_norm": 1.4475363138688357, + "learning_rate": 4.911831452624253e-07, + "loss": 1.655, + "step": 2440 + }, + { + "epoch": 0.09088054380553041, + "grad_norm": 1.4410524177419237, + "learning_rate": 4.910377891601439e-07, + "loss": 1.6977, + "step": 2460 + }, + { + "epoch": 0.09161941001533147, + "grad_norm": 1.512362377905178, + "learning_rate": 4.908912689936697e-07, + "loss": 1.6716, + "step": 2480 + }, + { + "epoch": 0.09235827622513254, + "grad_norm": 1.8697344841744916, + "learning_rate": 4.90743585552489e-07, + "loss": 1.6694, + "step": 2500 + }, + { + "epoch": 0.0930971424349336, + "grad_norm": 1.5406884179833267, + "learning_rate": 4.905947396323561e-07, + "loss": 1.7013, + "step": 2520 + }, + { + "epoch": 0.09383600864473465, + "grad_norm": 1.883868312926782, + "learning_rate": 4.904447320352891e-07, + "loss": 1.6438, + "step": 2540 + }, + { + "epoch": 0.09457487485453571, + "grad_norm": 1.5063919396389938, + "learning_rate": 4.902935635695655e-07, + "loss": 1.6341, + "step": 2560 + }, + { + "epoch": 0.09531374106433678, + "grad_norm": 2.5666944465723223, + "learning_rate": 4.901412350497177e-07, + "loss": 1.673, + "step": 2580 + }, + { + "epoch": 0.09605260727413784, + "grad_norm": 1.6864017944187357, + "learning_rate": 4.899877472965289e-07, + "loss": 1.6532, + "step": 2600 + }, + { + "epoch": 0.09679147348393889, + "grad_norm": 1.489838376943142, + "learning_rate": 4.898331011370282e-07, + "loss": 1.7006, + "step": 2620 + }, + { + "epoch": 0.09753033969373996, + "grad_norm": 1.516224235445671, + "learning_rate": 4.896772974044871e-07, + "loss": 1.664, + "step": 2640 + }, + { + "epoch": 0.09826920590354102, + "grad_norm": 1.4154240383552321, + "learning_rate": 4.895203369384138e-07, + "loss": 1.6749, + "step": 2660 + }, + { + "epoch": 0.09900807211334207, + "grad_norm": 1.8653781111338754, + "learning_rate": 4.893622205845498e-07, + "loss": 1.6255, + "step": 2680 + }, + { + "epoch": 0.09974693832314313, + "grad_norm": 1.6154619117281779, + "learning_rate": 4.892029491948642e-07, + "loss": 1.7121, + "step": 2700 + }, + { + "epoch": 0.1004858045329442, + "grad_norm": 1.6240732568528131, + "learning_rate": 4.890425236275502e-07, + "loss": 1.687, + "step": 2720 + }, + { + "epoch": 0.10122467074274526, + "grad_norm": 1.459326292962488, + "learning_rate": 4.888809447470195e-07, + "loss": 1.5967, + "step": 2740 + }, + { + "epoch": 0.10196353695254631, + "grad_norm": 1.7582112558661527, + "learning_rate": 4.887182134238989e-07, + "loss": 1.7297, + "step": 2760 + }, + { + "epoch": 0.10270240316234738, + "grad_norm": 1.7154567295687058, + "learning_rate": 4.885543305350241e-07, + "loss": 1.6881, + "step": 2780 + }, + { + "epoch": 0.10344126937214844, + "grad_norm": 1.68486225816754, + "learning_rate": 4.88389296963436e-07, + "loss": 1.6351, + "step": 2800 + }, + { + "epoch": 0.1041801355819495, + "grad_norm": 1.4658940210533413, + "learning_rate": 4.882231135983757e-07, + "loss": 1.6584, + "step": 2820 + }, + { + "epoch": 0.10491900179175055, + "grad_norm": 1.3967168353938462, + "learning_rate": 4.880557813352796e-07, + "loss": 1.6811, + "step": 2840 + }, + { + "epoch": 0.10565786800155162, + "grad_norm": 1.6648778148188543, + "learning_rate": 4.878873010757747e-07, + "loss": 1.6447, + "step": 2860 + }, + { + "epoch": 0.10639673421135268, + "grad_norm": 1.6827360384506134, + "learning_rate": 4.877176737276736e-07, + "loss": 1.6671, + "step": 2880 + }, + { + "epoch": 0.10713560042115373, + "grad_norm": 1.6125148782802161, + "learning_rate": 4.875469002049697e-07, + "loss": 1.6611, + "step": 2900 + }, + { + "epoch": 0.1078744666309548, + "grad_norm": 3.1640996826552925, + "learning_rate": 4.873749814278325e-07, + "loss": 1.6914, + "step": 2920 + }, + { + "epoch": 0.10861333284075586, + "grad_norm": 1.5756821875718683, + "learning_rate": 4.87201918322602e-07, + "loss": 1.6891, + "step": 2940 + }, + { + "epoch": 0.10935219905055692, + "grad_norm": 1.508384464413988, + "learning_rate": 4.870277118217844e-07, + "loss": 1.6765, + "step": 2960 + }, + { + "epoch": 0.11009106526035799, + "grad_norm": 1.8943879400046142, + "learning_rate": 4.868523628640468e-07, + "loss": 1.6718, + "step": 2980 + }, + { + "epoch": 0.11082993147015904, + "grad_norm": 1.5476264075937183, + "learning_rate": 4.86675872394212e-07, + "loss": 1.6384, + "step": 3000 + }, + { + "epoch": 0.1115687976799601, + "grad_norm": 1.7120101891654744, + "learning_rate": 4.864982413632537e-07, + "loss": 1.66, + "step": 3020 + }, + { + "epoch": 0.11230766388976116, + "grad_norm": 1.8834789513548644, + "learning_rate": 4.863284363107887e-07, + "loss": 1.6453, + "step": 3040 + }, + { + "epoch": 0.11304653009956223, + "grad_norm": 1.6393861847878763, + "learning_rate": 4.861485839441465e-07, + "loss": 1.6914, + "step": 3060 + }, + { + "epoch": 0.11378539630936328, + "grad_norm": 1.548505894649462, + "learning_rate": 4.859675938575391e-07, + "loss": 1.6513, + "step": 3080 + }, + { + "epoch": 0.11452426251916434, + "grad_norm": 1.7314160899998987, + "learning_rate": 4.857854670261854e-07, + "loss": 1.6652, + "step": 3100 + }, + { + "epoch": 0.11526312872896541, + "grad_norm": 1.6255645061866926, + "learning_rate": 4.856022044314289e-07, + "loss": 1.6825, + "step": 3120 + }, + { + "epoch": 0.11600199493876646, + "grad_norm": 1.7047082936180922, + "learning_rate": 4.854178070607332e-07, + "loss": 1.6571, + "step": 3140 + }, + { + "epoch": 0.11674086114856752, + "grad_norm": 1.5937691951508997, + "learning_rate": 4.852322759076762e-07, + "loss": 1.6796, + "step": 3160 + }, + { + "epoch": 0.11747972735836858, + "grad_norm": 1.5581038553350461, + "learning_rate": 4.850456119719448e-07, + "loss": 1.6237, + "step": 3180 + }, + { + "epoch": 0.11821859356816965, + "grad_norm": 1.5319442885899253, + "learning_rate": 4.848578162593298e-07, + "loss": 1.6507, + "step": 3200 + }, + { + "epoch": 0.1189574597779707, + "grad_norm": 1.7452294652307094, + "learning_rate": 4.846783629455789e-07, + "loss": 1.6334, + "step": 3220 + }, + { + "epoch": 0.11969632598777176, + "grad_norm": 1.6423452527210813, + "learning_rate": 4.844883631840362e-07, + "loss": 1.6591, + "step": 3240 + }, + { + "epoch": 0.12043519219757283, + "grad_norm": 2.3138017105742277, + "learning_rate": 4.842972346482019e-07, + "loss": 1.6693, + "step": 3260 + }, + { + "epoch": 0.12117405840737389, + "grad_norm": 1.5077648756938484, + "learning_rate": 4.841049783679233e-07, + "loss": 1.6486, + "step": 3280 + }, + { + "epoch": 0.12191292461717494, + "grad_norm": 1.4711190983794034, + "learning_rate": 4.839115953791238e-07, + "loss": 1.6881, + "step": 3300 + }, + { + "epoch": 0.12265179082697601, + "grad_norm": 4.058044242916531, + "learning_rate": 4.837170867237982e-07, + "loss": 1.6469, + "step": 3320 + }, + { + "epoch": 0.12339065703677707, + "grad_norm": 1.8109757223352017, + "learning_rate": 4.835214534500064e-07, + "loss": 1.6912, + "step": 3340 + }, + { + "epoch": 0.12412952324657812, + "grad_norm": 1.5112894099167034, + "learning_rate": 4.83324696611868e-07, + "loss": 1.6452, + "step": 3360 + }, + { + "epoch": 0.12486838945637918, + "grad_norm": 1.7532693818843224, + "learning_rate": 4.83126817269557e-07, + "loss": 1.6158, + "step": 3380 + }, + { + "epoch": 0.12560725566618025, + "grad_norm": 1.7433921276878421, + "learning_rate": 4.829278164892951e-07, + "loss": 1.6684, + "step": 3400 + }, + { + "epoch": 0.1263461218759813, + "grad_norm": 1.499971805431214, + "learning_rate": 4.827276953433474e-07, + "loss": 1.6596, + "step": 3420 + }, + { + "epoch": 0.12708498808578236, + "grad_norm": 1.5392331224579805, + "learning_rate": 4.825264549100149e-07, + "loss": 1.6411, + "step": 3440 + }, + { + "epoch": 0.12782385429558343, + "grad_norm": 1.5289257318137572, + "learning_rate": 4.823240962736303e-07, + "loss": 1.6759, + "step": 3460 + }, + { + "epoch": 0.12856272050538448, + "grad_norm": 1.5034439532563377, + "learning_rate": 4.82120620524551e-07, + "loss": 1.6405, + "step": 3480 + }, + { + "epoch": 0.12930158671518555, + "grad_norm": 1.4978715454221503, + "learning_rate": 4.81916028759154e-07, + "loss": 1.6732, + "step": 3500 + }, + { + "epoch": 0.13004045292498662, + "grad_norm": 1.45790640802375, + "learning_rate": 4.817103220798296e-07, + "loss": 1.6649, + "step": 3520 + }, + { + "epoch": 0.13077931913478766, + "grad_norm": 1.5322708095688835, + "learning_rate": 4.815035015949754e-07, + "loss": 1.6588, + "step": 3540 + }, + { + "epoch": 0.13151818534458873, + "grad_norm": 1.540513558070265, + "learning_rate": 4.812955684189904e-07, + "loss": 1.6718, + "step": 3560 + }, + { + "epoch": 0.1322570515543898, + "grad_norm": 1.4880225438470713, + "learning_rate": 4.810865236722692e-07, + "loss": 1.6313, + "step": 3580 + }, + { + "epoch": 0.13299591776419084, + "grad_norm": 1.4919528959671158, + "learning_rate": 4.808763684811959e-07, + "loss": 1.62, + "step": 3600 + }, + { + "epoch": 0.1337347839739919, + "grad_norm": 1.6101194590431924, + "learning_rate": 4.806651039781377e-07, + "loss": 1.6933, + "step": 3620 + }, + { + "epoch": 0.13447365018379298, + "grad_norm": 1.5722737602103793, + "learning_rate": 4.804527313014392e-07, + "loss": 1.6555, + "step": 3640 + }, + { + "epoch": 0.13521251639359402, + "grad_norm": 1.647937670204523, + "learning_rate": 4.802392515954161e-07, + "loss": 1.6561, + "step": 3660 + }, + { + "epoch": 0.1359513826033951, + "grad_norm": 1.6527027343392149, + "learning_rate": 4.80024666010349e-07, + "loss": 1.6747, + "step": 3680 + }, + { + "epoch": 0.13669024881319616, + "grad_norm": 1.596151179002379, + "learning_rate": 4.798089757024773e-07, + "loss": 1.6602, + "step": 3700 + }, + { + "epoch": 0.1374291150229972, + "grad_norm": 1.6359785367644735, + "learning_rate": 4.795921818339928e-07, + "loss": 1.7041, + "step": 3720 + }, + { + "epoch": 0.13816798123279828, + "grad_norm": 1.5303851327334592, + "learning_rate": 4.793742855730337e-07, + "loss": 1.6921, + "step": 3740 + }, + { + "epoch": 0.13890684744259932, + "grad_norm": 1.552833624004378, + "learning_rate": 4.79155288093678e-07, + "loss": 1.6646, + "step": 3760 + }, + { + "epoch": 0.1396457136524004, + "grad_norm": 1.5328749650552398, + "learning_rate": 4.789351905759377e-07, + "loss": 1.671, + "step": 3780 + }, + { + "epoch": 0.14038457986220146, + "grad_norm": 1.4637618775535644, + "learning_rate": 4.787139942057513e-07, + "loss": 1.6826, + "step": 3800 + }, + { + "epoch": 0.1411234460720025, + "grad_norm": 1.456698106912096, + "learning_rate": 4.784917001749791e-07, + "loss": 1.7079, + "step": 3820 + }, + { + "epoch": 0.14186231228180357, + "grad_norm": 1.4778158837226694, + "learning_rate": 4.782683096813954e-07, + "loss": 1.6673, + "step": 3840 + }, + { + "epoch": 0.14260117849160464, + "grad_norm": 1.426517743754919, + "learning_rate": 4.780438239286824e-07, + "loss": 1.6327, + "step": 3860 + }, + { + "epoch": 0.14334004470140568, + "grad_norm": 1.7717097070454197, + "learning_rate": 4.77818244126424e-07, + "loss": 1.6577, + "step": 3880 + }, + { + "epoch": 0.14407891091120675, + "grad_norm": 1.6008901431845195, + "learning_rate": 4.775915714900992e-07, + "loss": 1.6493, + "step": 3900 + }, + { + "epoch": 0.14481777712100782, + "grad_norm": 1.5377457534191892, + "learning_rate": 4.773638072410752e-07, + "loss": 1.6668, + "step": 3920 + }, + { + "epoch": 0.14555664333080887, + "grad_norm": 1.9114280227385299, + "learning_rate": 4.771349526066014e-07, + "loss": 1.6925, + "step": 3940 + }, + { + "epoch": 0.14629550954060994, + "grad_norm": 1.803899924444919, + "learning_rate": 4.769050088198021e-07, + "loss": 1.6775, + "step": 3960 + }, + { + "epoch": 0.147034375750411, + "grad_norm": 1.5100721777601815, + "learning_rate": 4.7667397711967037e-07, + "loss": 1.6181, + "step": 3980 + }, + { + "epoch": 0.14777324196021205, + "grad_norm": 1.4720945445766893, + "learning_rate": 4.764418587510615e-07, + "loss": 1.6607, + "step": 4000 + }, + { + "epoch": 0.14851210817001312, + "grad_norm": 1.569266687535282, + "learning_rate": 4.7620865496468544e-07, + "loss": 1.6829, + "step": 4020 + }, + { + "epoch": 0.1492509743798142, + "grad_norm": 1.5799540185979453, + "learning_rate": 4.7597436701710107e-07, + "loss": 1.6483, + "step": 4040 + }, + { + "epoch": 0.14998984058961523, + "grad_norm": 1.5804308168544465, + "learning_rate": 4.75738996170709e-07, + "loss": 1.6924, + "step": 4060 + }, + { + "epoch": 0.1507287067994163, + "grad_norm": 1.523398154876467, + "learning_rate": 4.7550254369374455e-07, + "loss": 1.6519, + "step": 4080 + }, + { + "epoch": 0.15146757300921734, + "grad_norm": 1.4233865381689017, + "learning_rate": 4.752650108602712e-07, + "loss": 1.664, + "step": 4100 + }, + { + "epoch": 0.1522064392190184, + "grad_norm": 1.512734811893487, + "learning_rate": 4.7502639895017366e-07, + "loss": 1.7103, + "step": 4120 + }, + { + "epoch": 0.15294530542881948, + "grad_norm": 1.5630800949377466, + "learning_rate": 4.747867092491511e-07, + "loss": 1.6531, + "step": 4140 + }, + { + "epoch": 0.15368417163862053, + "grad_norm": 1.470144612554125, + "learning_rate": 4.7454594304870977e-07, + "loss": 1.6725, + "step": 4160 + }, + { + "epoch": 0.1544230378484216, + "grad_norm": 1.6569477682445206, + "learning_rate": 4.743041016461567e-07, + "loss": 1.6998, + "step": 4180 + }, + { + "epoch": 0.15516190405822267, + "grad_norm": 1.7296103801240361, + "learning_rate": 4.7406118634459223e-07, + "loss": 1.6613, + "step": 4200 + }, + { + "epoch": 0.1559007702680237, + "grad_norm": 1.6463696442561442, + "learning_rate": 4.738171984529031e-07, + "loss": 1.6575, + "step": 4220 + }, + { + "epoch": 0.15663963647782478, + "grad_norm": 1.545869558479261, + "learning_rate": 4.7357213928575546e-07, + "loss": 1.6741, + "step": 4240 + }, + { + "epoch": 0.15737850268762585, + "grad_norm": 1.7796493147352748, + "learning_rate": 4.7332601016358773e-07, + "loss": 1.7046, + "step": 4260 + }, + { + "epoch": 0.1581173688974269, + "grad_norm": 1.5172414763731175, + "learning_rate": 4.7307881241260365e-07, + "loss": 1.6365, + "step": 4280 + }, + { + "epoch": 0.15885623510722796, + "grad_norm": 1.5470321540163943, + "learning_rate": 4.7283054736476474e-07, + "loss": 1.6844, + "step": 4300 + }, + { + "epoch": 0.15959510131702903, + "grad_norm": 1.5074962263335083, + "learning_rate": 4.725812163577835e-07, + "loss": 1.6683, + "step": 4320 + }, + { + "epoch": 0.16033396752683007, + "grad_norm": 1.5931587963454854, + "learning_rate": 4.723308207351162e-07, + "loss": 1.6972, + "step": 4340 + }, + { + "epoch": 0.16107283373663114, + "grad_norm": 1.4335946997211053, + "learning_rate": 4.720793618459553e-07, + "loss": 1.6182, + "step": 4360 + }, + { + "epoch": 0.1618116999464322, + "grad_norm": 1.9207877719443267, + "learning_rate": 4.718268410452226e-07, + "loss": 1.6777, + "step": 4380 + }, + { + "epoch": 0.16255056615623326, + "grad_norm": 1.4490578223410473, + "learning_rate": 4.7157325969356143e-07, + "loss": 1.6911, + "step": 4400 + }, + { + "epoch": 0.16328943236603433, + "grad_norm": 1.593847776562296, + "learning_rate": 4.713186191573301e-07, + "loss": 1.6927, + "step": 4420 + }, + { + "epoch": 0.16402829857583537, + "grad_norm": 1.4739123126868083, + "learning_rate": 4.7106292080859363e-07, + "loss": 1.6492, + "step": 4440 + }, + { + "epoch": 0.16476716478563644, + "grad_norm": 1.424511297941709, + "learning_rate": 4.7080616602511705e-07, + "loss": 1.6847, + "step": 4460 + }, + { + "epoch": 0.1655060309954375, + "grad_norm": 1.6007681786366288, + "learning_rate": 4.705483561903576e-07, + "loss": 1.662, + "step": 4480 + }, + { + "epoch": 0.16624489720523855, + "grad_norm": 1.55690540989863, + "learning_rate": 4.702894926934573e-07, + "loss": 1.6851, + "step": 4500 + }, + { + "epoch": 0.16698376341503962, + "grad_norm": 2.0423474735881926, + "learning_rate": 4.700295769292359e-07, + "loss": 1.6604, + "step": 4520 + }, + { + "epoch": 0.1677226296248407, + "grad_norm": 1.453355289637868, + "learning_rate": 4.6976861029818264e-07, + "loss": 1.6842, + "step": 4540 + }, + { + "epoch": 0.16846149583464173, + "grad_norm": 1.5505160972568328, + "learning_rate": 4.695065942064494e-07, + "loss": 1.6804, + "step": 4560 + }, + { + "epoch": 0.1692003620444428, + "grad_norm": 1.7608287873846744, + "learning_rate": 4.6924353006584244e-07, + "loss": 1.6595, + "step": 4580 + }, + { + "epoch": 0.16993922825424387, + "grad_norm": 1.4685283699391545, + "learning_rate": 4.689794192938156e-07, + "loss": 1.6264, + "step": 4600 + }, + { + "epoch": 0.17067809446404492, + "grad_norm": 1.7781661683868824, + "learning_rate": 4.687142633134619e-07, + "loss": 1.6875, + "step": 4620 + }, + { + "epoch": 0.17141696067384599, + "grad_norm": 1.6196809334292608, + "learning_rate": 4.6844806355350623e-07, + "loss": 1.6753, + "step": 4640 + }, + { + "epoch": 0.17215582688364706, + "grad_norm": 1.6293152376567321, + "learning_rate": 4.6818082144829787e-07, + "loss": 1.6665, + "step": 4660 + }, + { + "epoch": 0.1728946930934481, + "grad_norm": 1.510069163173277, + "learning_rate": 4.6791253843780217e-07, + "loss": 1.6697, + "step": 4680 + }, + { + "epoch": 0.17363355930324917, + "grad_norm": 1.4471973015401869, + "learning_rate": 4.676432159675933e-07, + "loss": 1.6806, + "step": 4700 + }, + { + "epoch": 0.1743724255130502, + "grad_norm": 1.7753201121195747, + "learning_rate": 4.6737285548884655e-07, + "loss": 1.6935, + "step": 4720 + }, + { + "epoch": 0.17511129172285128, + "grad_norm": 1.5222859899502188, + "learning_rate": 4.671014584583296e-07, + "loss": 1.6664, + "step": 4740 + }, + { + "epoch": 0.17585015793265235, + "grad_norm": 1.4892529478692567, + "learning_rate": 4.668290263383959e-07, + "loss": 1.6669, + "step": 4760 + }, + { + "epoch": 0.1765890241424534, + "grad_norm": 1.5841443455470228, + "learning_rate": 4.66555560596976e-07, + "loss": 1.6419, + "step": 4780 + }, + { + "epoch": 0.17732789035225446, + "grad_norm": 1.5264328160932443, + "learning_rate": 4.6628106270757e-07, + "loss": 1.6642, + "step": 4800 + }, + { + "epoch": 0.17806675656205553, + "grad_norm": 1.6887371299004348, + "learning_rate": 4.6600553414923913e-07, + "loss": 1.6387, + "step": 4820 + }, + { + "epoch": 0.17880562277185658, + "grad_norm": 1.4594422560615166, + "learning_rate": 4.657289764065985e-07, + "loss": 1.6493, + "step": 4840 + }, + { + "epoch": 0.17954448898165765, + "grad_norm": 1.6615232385858325, + "learning_rate": 4.6545139096980846e-07, + "loss": 1.6312, + "step": 4860 + }, + { + "epoch": 0.18028335519145872, + "grad_norm": 1.4161658999634517, + "learning_rate": 4.651727793345669e-07, + "loss": 1.687, + "step": 4880 + }, + { + "epoch": 0.18102222140125976, + "grad_norm": 1.4750799503852594, + "learning_rate": 4.6489314300210117e-07, + "loss": 1.6579, + "step": 4900 + }, + { + "epoch": 0.18176108761106083, + "grad_norm": 1.5823630581751142, + "learning_rate": 4.646124834791598e-07, + "loss": 1.6974, + "step": 4920 + }, + { + "epoch": 0.1824999538208619, + "grad_norm": 1.5953496527857909, + "learning_rate": 4.6433080227800476e-07, + "loss": 1.6349, + "step": 4940 + }, + { + "epoch": 0.18323882003066294, + "grad_norm": 1.8088958779925088, + "learning_rate": 4.640481009164028e-07, + "loss": 1.7021, + "step": 4960 + }, + { + "epoch": 0.183977686240464, + "grad_norm": 1.6985722383661672, + "learning_rate": 4.6376438091761776e-07, + "loss": 1.6835, + "step": 4980 + }, + { + "epoch": 0.18471655245026508, + "grad_norm": 1.5740586459999972, + "learning_rate": 4.63479643810402e-07, + "loss": 1.6778, + "step": 5000 + }, + { + "epoch": 0.18545541866006612, + "grad_norm": 1.5576615822168314, + "learning_rate": 4.631938911289884e-07, + "loss": 1.6432, + "step": 5020 + }, + { + "epoch": 0.1861942848698672, + "grad_norm": 1.4882435243374539, + "learning_rate": 4.629071244130818e-07, + "loss": 1.697, + "step": 5040 + }, + { + "epoch": 0.18693315107966824, + "grad_norm": 1.7414218611909407, + "learning_rate": 4.6261934520785135e-07, + "loss": 1.6472, + "step": 5060 + }, + { + "epoch": 0.1876720172894693, + "grad_norm": 1.5111215790202166, + "learning_rate": 4.623305550639212e-07, + "loss": 1.6814, + "step": 5080 + }, + { + "epoch": 0.18841088349927038, + "grad_norm": 1.4998930010938694, + "learning_rate": 4.6204075553736317e-07, + "loss": 1.6965, + "step": 5100 + }, + { + "epoch": 0.18914974970907142, + "grad_norm": 1.6166379161449234, + "learning_rate": 4.617499481896874e-07, + "loss": 1.6367, + "step": 5120 + }, + { + "epoch": 0.1898886159188725, + "grad_norm": 1.564060473042759, + "learning_rate": 4.6145813458783484e-07, + "loss": 1.6404, + "step": 5140 + }, + { + "epoch": 0.19062748212867356, + "grad_norm": 1.5498475055243737, + "learning_rate": 4.611653163041681e-07, + "loss": 1.64, + "step": 5160 + }, + { + "epoch": 0.1913663483384746, + "grad_norm": 1.6108037998097682, + "learning_rate": 4.6087149491646343e-07, + "loss": 1.699, + "step": 5180 + }, + { + "epoch": 0.19210521454827567, + "grad_norm": 1.6995541712978521, + "learning_rate": 4.6057667200790203e-07, + "loss": 1.6546, + "step": 5200 + }, + { + "epoch": 0.19284408075807674, + "grad_norm": 1.499037507366822, + "learning_rate": 4.6028084916706147e-07, + "loss": 1.6083, + "step": 5220 + }, + { + "epoch": 0.19358294696787778, + "grad_norm": 1.5172594570626625, + "learning_rate": 4.5998402798790704e-07, + "loss": 1.6699, + "step": 5240 + }, + { + "epoch": 0.19432181317767885, + "grad_norm": 1.4963740648019974, + "learning_rate": 4.5968621006978373e-07, + "loss": 1.6898, + "step": 5260 + }, + { + "epoch": 0.19506067938747992, + "grad_norm": 2.566805183937073, + "learning_rate": 4.5938739701740686e-07, + "loss": 1.6694, + "step": 5280 + }, + { + "epoch": 0.19579954559728097, + "grad_norm": 1.4540566793967926, + "learning_rate": 4.590875904408539e-07, + "loss": 1.6692, + "step": 5300 + }, + { + "epoch": 0.19653841180708204, + "grad_norm": 3.9730656922103447, + "learning_rate": 4.587867919555557e-07, + "loss": 1.6625, + "step": 5320 + }, + { + "epoch": 0.1972772780168831, + "grad_norm": 1.5142078546698041, + "learning_rate": 4.5848500318228774e-07, + "loss": 1.6654, + "step": 5340 + }, + { + "epoch": 0.19801614422668415, + "grad_norm": 1.7032492720795371, + "learning_rate": 4.5818222574716127e-07, + "loss": 1.7022, + "step": 5360 + }, + { + "epoch": 0.19875501043648522, + "grad_norm": 1.554191757548726, + "learning_rate": 4.578784612816149e-07, + "loss": 1.6811, + "step": 5380 + }, + { + "epoch": 0.19949387664628626, + "grad_norm": 1.4929225978552914, + "learning_rate": 4.5758897229313755e-07, + "loss": 1.6509, + "step": 5400 + }, + { + "epoch": 0.20023274285608733, + "grad_norm": 1.4628893559215694, + "learning_rate": 4.5728328783083036e-07, + "loss": 1.7302, + "step": 5420 + }, + { + "epoch": 0.2009716090658884, + "grad_norm": 1.493249123165425, + "learning_rate": 4.5699197781569844e-07, + "loss": 1.6383, + "step": 5440 + }, + { + "epoch": 0.20171047527568944, + "grad_norm": 1.600690331893774, + "learning_rate": 4.5668437961972905e-07, + "loss": 1.6189, + "step": 5460 + }, + { + "epoch": 0.2024493414854905, + "grad_norm": 1.464802503893095, + "learning_rate": 4.5637580246409934e-07, + "loss": 1.65, + "step": 5480 + }, + { + "epoch": 0.20318820769529158, + "grad_norm": 1.5375722464094912, + "learning_rate": 4.5606624801149797e-07, + "loss": 1.6546, + "step": 5500 + }, + { + "epoch": 0.20392707390509263, + "grad_norm": 1.5967568446324583, + "learning_rate": 4.5575571792987984e-07, + "loss": 1.6286, + "step": 5520 + }, + { + "epoch": 0.2046659401148937, + "grad_norm": 1.5568969231756908, + "learning_rate": 4.5544421389245646e-07, + "loss": 1.6278, + "step": 5540 + }, + { + "epoch": 0.20540480632469477, + "grad_norm": 1.5499607650206735, + "learning_rate": 4.5513173757768746e-07, + "loss": 1.6755, + "step": 5560 + }, + { + "epoch": 0.2061436725344958, + "grad_norm": 1.4823222337131237, + "learning_rate": 4.548182906692714e-07, + "loss": 1.6661, + "step": 5580 + }, + { + "epoch": 0.20688253874429688, + "grad_norm": 1.507552555113675, + "learning_rate": 4.5450387485613635e-07, + "loss": 1.6659, + "step": 5600 + }, + { + "epoch": 0.20762140495409795, + "grad_norm": 1.4811185047336115, + "learning_rate": 4.541884918324313e-07, + "loss": 1.656, + "step": 5620 + }, + { + "epoch": 0.208360271163899, + "grad_norm": 1.576191450168426, + "learning_rate": 4.538721432975168e-07, + "loss": 1.6875, + "step": 5640 + }, + { + "epoch": 0.20909913737370006, + "grad_norm": 1.7938635395127402, + "learning_rate": 4.535707194370682e-07, + "loss": 1.6646, + "step": 5660 + }, + { + "epoch": 0.2098380035835011, + "grad_norm": 1.6552255449585238, + "learning_rate": 4.532524930627744e-07, + "loss": 1.6524, + "step": 5680 + }, + { + "epoch": 0.21057686979330217, + "grad_norm": 1.7516118506092397, + "learning_rate": 4.5293330622066034e-07, + "loss": 1.6157, + "step": 5700 + }, + { + "epoch": 0.21131573600310324, + "grad_norm": 1.4545866638005132, + "learning_rate": 4.526131606305823e-07, + "loss": 1.6476, + "step": 5720 + }, + { + "epoch": 0.2120546022129043, + "grad_norm": 1.6248585310317667, + "learning_rate": 4.5229205801756273e-07, + "loss": 1.6573, + "step": 5740 + }, + { + "epoch": 0.21279346842270536, + "grad_norm": 1.41925791489552, + "learning_rate": 4.519700001117807e-07, + "loss": 1.6685, + "step": 5760 + }, + { + "epoch": 0.21353233463250643, + "grad_norm": 1.7509635950883726, + "learning_rate": 4.5164698864856257e-07, + "loss": 1.6812, + "step": 5780 + }, + { + "epoch": 0.21427120084230747, + "grad_norm": 1.4694228842841779, + "learning_rate": 4.5132302536837273e-07, + "loss": 1.6556, + "step": 5800 + }, + { + "epoch": 0.21501006705210854, + "grad_norm": 1.553864895417105, + "learning_rate": 4.5099811201680416e-07, + "loss": 1.6883, + "step": 5820 + }, + { + "epoch": 0.2157489332619096, + "grad_norm": 1.491366651426128, + "learning_rate": 4.506722503445691e-07, + "loss": 1.6613, + "step": 5840 + }, + { + "epoch": 0.21648779947171065, + "grad_norm": 1.6466798284982602, + "learning_rate": 4.5034544210748953e-07, + "loss": 1.6497, + "step": 5860 + }, + { + "epoch": 0.21722666568151172, + "grad_norm": 1.4331846976152014, + "learning_rate": 4.5001768906648783e-07, + "loss": 1.6583, + "step": 5880 + }, + { + "epoch": 0.2179655318913128, + "grad_norm": 2.4779046528418793, + "learning_rate": 4.496889929875771e-07, + "loss": 1.6456, + "step": 5900 + }, + { + "epoch": 0.21870439810111383, + "grad_norm": 1.6613792185698004, + "learning_rate": 4.493593556418519e-07, + "loss": 1.6876, + "step": 5920 + }, + { + "epoch": 0.2194432643109149, + "grad_norm": 1.5936970250540041, + "learning_rate": 4.490287788054785e-07, + "loss": 1.6856, + "step": 5940 + }, + { + "epoch": 0.22018213052071597, + "grad_norm": 1.7774522510719284, + "learning_rate": 4.486972642596852e-07, + "loss": 1.6574, + "step": 5960 + }, + { + "epoch": 0.22092099673051702, + "grad_norm": 1.5404871158832736, + "learning_rate": 4.483648137907532e-07, + "loss": 1.6637, + "step": 5980 + }, + { + "epoch": 0.2216598629403181, + "grad_norm": 1.5238762502370415, + "learning_rate": 4.4803142919000645e-07, + "loss": 1.6526, + "step": 6000 + }, + { + "epoch": 0.22239872915011913, + "grad_norm": 1.4681103098352588, + "learning_rate": 4.4769711225380254e-07, + "loss": 1.6538, + "step": 6020 + }, + { + "epoch": 0.2231375953599202, + "grad_norm": 1.406496721553823, + "learning_rate": 4.4736186478352225e-07, + "loss": 1.6593, + "step": 6040 + }, + { + "epoch": 0.22387646156972127, + "grad_norm": 1.6502790317877305, + "learning_rate": 4.4702568858556063e-07, + "loss": 1.6946, + "step": 6060 + }, + { + "epoch": 0.2246153277795223, + "grad_norm": 1.5544958034860874, + "learning_rate": 4.466885854713169e-07, + "loss": 1.6922, + "step": 6080 + }, + { + "epoch": 0.22535419398932338, + "grad_norm": 1.35257283259656, + "learning_rate": 4.463505572571847e-07, + "loss": 1.6646, + "step": 6100 + }, + { + "epoch": 0.22609306019912445, + "grad_norm": 1.624788597950665, + "learning_rate": 4.460116057645422e-07, + "loss": 1.6464, + "step": 6120 + }, + { + "epoch": 0.2268319264089255, + "grad_norm": 1.5573729356283417, + "learning_rate": 4.4567173281974274e-07, + "loss": 1.6311, + "step": 6140 + }, + { + "epoch": 0.22757079261872656, + "grad_norm": 1.9342192243430807, + "learning_rate": 4.453309402541044e-07, + "loss": 1.6517, + "step": 6160 + }, + { + "epoch": 0.22830965882852763, + "grad_norm": 1.6525422759457808, + "learning_rate": 4.4498922990390044e-07, + "loss": 1.6584, + "step": 6180 + }, + { + "epoch": 0.22904852503832868, + "grad_norm": 1.3709737663427297, + "learning_rate": 4.446466036103493e-07, + "loss": 1.6552, + "step": 6200 + }, + { + "epoch": 0.22978739124812975, + "grad_norm": 1.7619047090616546, + "learning_rate": 4.44303063219605e-07, + "loss": 1.6515, + "step": 6220 + }, + { + "epoch": 0.23052625745793082, + "grad_norm": 1.425527104774275, + "learning_rate": 4.439586105827468e-07, + "loss": 1.7082, + "step": 6240 + }, + { + "epoch": 0.23126512366773186, + "grad_norm": 2.183066565667764, + "learning_rate": 4.436132475557693e-07, + "loss": 1.6457, + "step": 6260 + }, + { + "epoch": 0.23200398987753293, + "grad_norm": 2.5631189419788103, + "learning_rate": 4.432669759995725e-07, + "loss": 1.6441, + "step": 6280 + }, + { + "epoch": 0.232742856087334, + "grad_norm": 1.531958854525398, + "learning_rate": 4.4291979777995186e-07, + "loss": 1.6597, + "step": 6300 + }, + { + "epoch": 0.23348172229713504, + "grad_norm": 1.7334807358971334, + "learning_rate": 4.4257171476758813e-07, + "loss": 1.6189, + "step": 6320 + }, + { + "epoch": 0.2342205885069361, + "grad_norm": 1.606688663391079, + "learning_rate": 4.422227288380374e-07, + "loss": 1.6635, + "step": 6340 + }, + { + "epoch": 0.23495945471673715, + "grad_norm": 1.5504111994528522, + "learning_rate": 4.418728418717207e-07, + "loss": 1.6619, + "step": 6360 + }, + { + "epoch": 0.23569832092653822, + "grad_norm": 1.7059923161913078, + "learning_rate": 4.415220557539142e-07, + "loss": 1.6518, + "step": 6380 + }, + { + "epoch": 0.2364371871363393, + "grad_norm": 1.5282124634083587, + "learning_rate": 4.411703723747389e-07, + "loss": 1.6281, + "step": 6400 + }, + { + "epoch": 0.23717605334614034, + "grad_norm": 1.817029524914551, + "learning_rate": 4.4081779362915033e-07, + "loss": 1.6196, + "step": 6420 + }, + { + "epoch": 0.2379149195559414, + "grad_norm": 1.4287258918617316, + "learning_rate": 4.404643214169288e-07, + "loss": 1.6552, + "step": 6440 + }, + { + "epoch": 0.23865378576574248, + "grad_norm": 1.4874633967888828, + "learning_rate": 4.4010995764266845e-07, + "loss": 1.6398, + "step": 6460 + }, + { + "epoch": 0.23939265197554352, + "grad_norm": 1.721122957795877, + "learning_rate": 4.3975470421576764e-07, + "loss": 1.6512, + "step": 6480 + }, + { + "epoch": 0.2401315181853446, + "grad_norm": 1.523301573082442, + "learning_rate": 4.393985630504183e-07, + "loss": 1.6782, + "step": 6500 + }, + { + "epoch": 0.24087038439514566, + "grad_norm": 1.4599906341858953, + "learning_rate": 4.390415360655957e-07, + "loss": 1.6396, + "step": 6520 + }, + { + "epoch": 0.2416092506049467, + "grad_norm": 1.5009190844531946, + "learning_rate": 4.386836251850481e-07, + "loss": 1.648, + "step": 6540 + }, + { + "epoch": 0.24234811681474777, + "grad_norm": 1.3512220497620588, + "learning_rate": 4.3832483233728654e-07, + "loss": 1.6712, + "step": 6560 + }, + { + "epoch": 0.24308698302454884, + "grad_norm": 1.6590943419842232, + "learning_rate": 4.379651594555741e-07, + "loss": 1.6174, + "step": 6580 + }, + { + "epoch": 0.24382584923434988, + "grad_norm": 1.3956181675020793, + "learning_rate": 4.376046084779159e-07, + "loss": 1.6173, + "step": 6600 + }, + { + "epoch": 0.24456471544415095, + "grad_norm": 1.5798276517321244, + "learning_rate": 4.3724318134704826e-07, + "loss": 1.6419, + "step": 6620 + }, + { + "epoch": 0.24530358165395202, + "grad_norm": 1.4769865046542814, + "learning_rate": 4.3688088001042866e-07, + "loss": 1.6631, + "step": 6640 + }, + { + "epoch": 0.24604244786375307, + "grad_norm": 1.7571296905735259, + "learning_rate": 4.3651770642022483e-07, + "loss": 1.6615, + "step": 6660 + }, + { + "epoch": 0.24678131407355414, + "grad_norm": 10.261084539724488, + "learning_rate": 4.361536625333045e-07, + "loss": 1.6515, + "step": 6680 + }, + { + "epoch": 0.24752018028335518, + "grad_norm": 2.7070070654149956, + "learning_rate": 4.3578875031122466e-07, + "loss": 1.6584, + "step": 6700 + }, + { + "epoch": 0.24825904649315625, + "grad_norm": 1.54607876926978, + "learning_rate": 4.3542297172022126e-07, + "loss": 1.6517, + "step": 6720 + }, + { + "epoch": 0.24899791270295732, + "grad_norm": 1.3861037085930092, + "learning_rate": 4.3505632873119844e-07, + "loss": 1.6686, + "step": 6740 + }, + { + "epoch": 0.24973677891275836, + "grad_norm": 1.4161848471548175, + "learning_rate": 4.346888233197178e-07, + "loss": 1.6449, + "step": 6760 + }, + { + "epoch": 0.25047564512255943, + "grad_norm": 1.9634719417599906, + "learning_rate": 4.343204574659878e-07, + "loss": 1.6586, + "step": 6780 + }, + { + "epoch": 0.2512145113323605, + "grad_norm": 2.2362709149394835, + "learning_rate": 4.339512331548535e-07, + "loss": 1.6481, + "step": 6800 + }, + { + "epoch": 0.25195337754216157, + "grad_norm": 2.435262162446439, + "learning_rate": 4.335811523757855e-07, + "loss": 1.6751, + "step": 6820 + }, + { + "epoch": 0.2526922437519626, + "grad_norm": 1.4440630152259213, + "learning_rate": 4.3321021712286874e-07, + "loss": 1.6865, + "step": 6840 + }, + { + "epoch": 0.25343110996176366, + "grad_norm": 1.6572017188801809, + "learning_rate": 4.3283842939479297e-07, + "loss": 1.6874, + "step": 6860 + }, + { + "epoch": 0.2541699761715647, + "grad_norm": 1.6358091879473202, + "learning_rate": 4.3246579119484086e-07, + "loss": 1.6442, + "step": 6880 + }, + { + "epoch": 0.2549088423813658, + "grad_norm": 1.861949731594006, + "learning_rate": 4.3209230453087763e-07, + "loss": 1.6596, + "step": 6900 + }, + { + "epoch": 0.25564770859116687, + "grad_norm": 1.576364259347636, + "learning_rate": 4.317179714153405e-07, + "loss": 1.6409, + "step": 6920 + }, + { + "epoch": 0.25638657480096794, + "grad_norm": 1.6344350623705748, + "learning_rate": 4.3134279386522734e-07, + "loss": 1.6634, + "step": 6940 + }, + { + "epoch": 0.25712544101076895, + "grad_norm": 2.4484913186668056, + "learning_rate": 4.3096677390208606e-07, + "loss": 1.6635, + "step": 6960 + }, + { + "epoch": 0.25786430722057, + "grad_norm": 1.459583448230627, + "learning_rate": 4.3058991355200385e-07, + "loss": 1.6437, + "step": 6980 + }, + { + "epoch": 0.2586031734303711, + "grad_norm": 2.0774440428993426, + "learning_rate": 4.302122148455959e-07, + "loss": 1.6807, + "step": 7000 + }, + { + "epoch": 0.25934203964017216, + "grad_norm": 1.4906050741171306, + "learning_rate": 4.2983367981799484e-07, + "loss": 1.6477, + "step": 7020 + }, + { + "epoch": 0.26008090584997323, + "grad_norm": 1.6727105507446454, + "learning_rate": 4.294543105088395e-07, + "loss": 1.617, + "step": 7040 + }, + { + "epoch": 0.2608197720597743, + "grad_norm": 1.4754199269220696, + "learning_rate": 4.2907410896226415e-07, + "loss": 1.6391, + "step": 7060 + }, + { + "epoch": 0.2615586382695753, + "grad_norm": 1.5380802874413815, + "learning_rate": 4.2869307722688715e-07, + "loss": 1.687, + "step": 7080 + }, + { + "epoch": 0.2622975044793764, + "grad_norm": 1.6040883755814137, + "learning_rate": 4.283112173558003e-07, + "loss": 1.7171, + "step": 7100 + }, + { + "epoch": 0.26303637068917746, + "grad_norm": 2.822094109735399, + "learning_rate": 4.279285314065575e-07, + "loss": 1.6671, + "step": 7120 + }, + { + "epoch": 0.2637752368989785, + "grad_norm": 1.4328096068889253, + "learning_rate": 4.275450214411638e-07, + "loss": 1.6475, + "step": 7140 + }, + { + "epoch": 0.2645141031087796, + "grad_norm": 1.624272809516238, + "learning_rate": 4.2716068952606424e-07, + "loss": 1.693, + "step": 7160 + }, + { + "epoch": 0.2652529693185806, + "grad_norm": 1.502383886350249, + "learning_rate": 4.267755377321327e-07, + "loss": 1.6592, + "step": 7180 + }, + { + "epoch": 0.2659918355283817, + "grad_norm": 1.4780327874669796, + "learning_rate": 4.2638956813466094e-07, + "loss": 1.6273, + "step": 7200 + }, + { + "epoch": 0.26673070173818275, + "grad_norm": 1.647788340317037, + "learning_rate": 4.2600278281334683e-07, + "loss": 1.7177, + "step": 7220 + }, + { + "epoch": 0.2674695679479838, + "grad_norm": 1.4249175729696602, + "learning_rate": 4.256151838522842e-07, + "loss": 1.6134, + "step": 7240 + }, + { + "epoch": 0.2682084341577849, + "grad_norm": 1.525640467280493, + "learning_rate": 4.252267733399502e-07, + "loss": 1.6279, + "step": 7260 + }, + { + "epoch": 0.26894730036758596, + "grad_norm": 1.5643231773087998, + "learning_rate": 4.2483755336919546e-07, + "loss": 1.6319, + "step": 7280 + }, + { + "epoch": 0.269686166577387, + "grad_norm": 1.5088025290660787, + "learning_rate": 4.2444752603723185e-07, + "loss": 1.6465, + "step": 7300 + }, + { + "epoch": 0.27042503278718805, + "grad_norm": 1.690559249481047, + "learning_rate": 4.2405669344562157e-07, + "loss": 1.6597, + "step": 7320 + }, + { + "epoch": 0.2711638989969891, + "grad_norm": 1.4158777914075165, + "learning_rate": 4.236650577002658e-07, + "loss": 1.6498, + "step": 7340 + }, + { + "epoch": 0.2719027652067902, + "grad_norm": 1.4954788634515361, + "learning_rate": 4.232726209113931e-07, + "loss": 1.7073, + "step": 7360 + }, + { + "epoch": 0.27264163141659126, + "grad_norm": 1.96245857269846, + "learning_rate": 4.228793851935486e-07, + "loss": 1.6559, + "step": 7380 + }, + { + "epoch": 0.2733804976263923, + "grad_norm": 1.5534874631194424, + "learning_rate": 4.22485352665582e-07, + "loss": 1.6795, + "step": 7400 + }, + { + "epoch": 0.27411936383619334, + "grad_norm": 1.513478614204036, + "learning_rate": 4.2209052545063645e-07, + "loss": 1.6598, + "step": 7420 + }, + { + "epoch": 0.2748582300459944, + "grad_norm": 1.4981685008613979, + "learning_rate": 4.216949056761371e-07, + "loss": 1.6796, + "step": 7440 + }, + { + "epoch": 0.2755970962557955, + "grad_norm": 1.453166525310124, + "learning_rate": 4.212984954737796e-07, + "loss": 1.6547, + "step": 7460 + }, + { + "epoch": 0.27633596246559655, + "grad_norm": 1.4590359213340498, + "learning_rate": 4.2090129697951865e-07, + "loss": 1.668, + "step": 7480 + }, + { + "epoch": 0.2770748286753976, + "grad_norm": 1.5012030999873756, + "learning_rate": 4.205033123335563e-07, + "loss": 1.6253, + "step": 7500 + }, + { + "epoch": 0.27781369488519864, + "grad_norm": 1.605863135582104, + "learning_rate": 4.2010454368033075e-07, + "loss": 1.6684, + "step": 7520 + }, + { + "epoch": 0.2785525610949997, + "grad_norm": 1.9991749625802369, + "learning_rate": 4.197049931685046e-07, + "loss": 1.6403, + "step": 7540 + }, + { + "epoch": 0.2792914273048008, + "grad_norm": 1.5084206750440898, + "learning_rate": 4.193046629509533e-07, + "loss": 1.6673, + "step": 7560 + }, + { + "epoch": 0.28003029351460185, + "grad_norm": 1.6013334792913052, + "learning_rate": 4.1890355518475335e-07, + "loss": 1.6483, + "step": 7580 + }, + { + "epoch": 0.2807691597244029, + "grad_norm": 1.798812837038986, + "learning_rate": 4.185016720311712e-07, + "loss": 1.6795, + "step": 7600 + }, + { + "epoch": 0.281508025934204, + "grad_norm": 1.4900500600235345, + "learning_rate": 4.18099015655651e-07, + "loss": 1.6807, + "step": 7620 + }, + { + "epoch": 0.282246892144005, + "grad_norm": 1.6028189719479609, + "learning_rate": 4.176955882278033e-07, + "loss": 1.6596, + "step": 7640 + }, + { + "epoch": 0.28298575835380607, + "grad_norm": 1.9939881516366833, + "learning_rate": 4.1729139192139335e-07, + "loss": 1.6695, + "step": 7660 + }, + { + "epoch": 0.28372462456360714, + "grad_norm": 1.5127346940191255, + "learning_rate": 4.168864289143291e-07, + "loss": 1.7078, + "step": 7680 + }, + { + "epoch": 0.2844634907734082, + "grad_norm": 1.5284950240291668, + "learning_rate": 4.1648070138864993e-07, + "loss": 1.7175, + "step": 7700 + }, + { + "epoch": 0.2852023569832093, + "grad_norm": 1.5249438102092971, + "learning_rate": 4.1607421153051454e-07, + "loss": 1.6753, + "step": 7720 + }, + { + "epoch": 0.28594122319301035, + "grad_norm": 1.6281345917446086, + "learning_rate": 4.156669615301891e-07, + "loss": 1.6455, + "step": 7740 + }, + { + "epoch": 0.28668008940281137, + "grad_norm": 1.7327391694790744, + "learning_rate": 4.152589535820358e-07, + "loss": 1.6115, + "step": 7760 + }, + { + "epoch": 0.28741895561261244, + "grad_norm": 1.8046545180697087, + "learning_rate": 4.148501898845008e-07, + "loss": 1.6752, + "step": 7780 + }, + { + "epoch": 0.2881578218224135, + "grad_norm": 1.4479684507284691, + "learning_rate": 4.144406726401024e-07, + "loss": 1.7095, + "step": 7800 + }, + { + "epoch": 0.2888966880322146, + "grad_norm": 1.5133767331728856, + "learning_rate": 4.140304040554192e-07, + "loss": 1.6637, + "step": 7820 + }, + { + "epoch": 0.28963555424201565, + "grad_norm": 1.69526484807945, + "learning_rate": 4.1361938634107795e-07, + "loss": 1.6604, + "step": 7840 + }, + { + "epoch": 0.29037442045181666, + "grad_norm": 1.5901137640996412, + "learning_rate": 4.132076217117425e-07, + "loss": 1.7023, + "step": 7860 + }, + { + "epoch": 0.29111328666161773, + "grad_norm": 1.423118541107655, + "learning_rate": 4.1279511238610075e-07, + "loss": 1.6251, + "step": 7880 + }, + { + "epoch": 0.2918521528714188, + "grad_norm": 1.3770610046698395, + "learning_rate": 4.123818605868533e-07, + "loss": 1.6859, + "step": 7900 + }, + { + "epoch": 0.29259101908121987, + "grad_norm": 1.5512042035926865, + "learning_rate": 4.1196786854070147e-07, + "loss": 1.6682, + "step": 7920 + }, + { + "epoch": 0.29332988529102094, + "grad_norm": 1.5657764052019774, + "learning_rate": 4.115531384783352e-07, + "loss": 1.6373, + "step": 7940 + }, + { + "epoch": 0.294068751500822, + "grad_norm": 1.3977001410170469, + "learning_rate": 4.11137672634421e-07, + "loss": 1.623, + "step": 7960 + }, + { + "epoch": 0.294807617710623, + "grad_norm": 1.5471885506840533, + "learning_rate": 4.1072147324759007e-07, + "loss": 1.6359, + "step": 7980 + }, + { + "epoch": 0.2955464839204241, + "grad_norm": 1.9646501043093372, + "learning_rate": 4.103045425604257e-07, + "loss": 1.6575, + "step": 8000 + }, + { + "epoch": 0.29628535013022517, + "grad_norm": 2.4554925260754192, + "learning_rate": 4.098868828194523e-07, + "loss": 1.6505, + "step": 8020 + }, + { + "epoch": 0.29702421634002624, + "grad_norm": 1.5764440647794176, + "learning_rate": 4.0946849627512194e-07, + "loss": 1.6537, + "step": 8040 + }, + { + "epoch": 0.2977630825498273, + "grad_norm": 1.5679031999275903, + "learning_rate": 4.090493851818032e-07, + "loss": 1.6678, + "step": 8060 + }, + { + "epoch": 0.2985019487596284, + "grad_norm": 1.5427978270277976, + "learning_rate": 4.086295517977688e-07, + "loss": 1.646, + "step": 8080 + }, + { + "epoch": 0.2992408149694294, + "grad_norm": 1.6159758168642673, + "learning_rate": 4.082089983851831e-07, + "loss": 1.6543, + "step": 8100 + }, + { + "epoch": 0.29997968117923046, + "grad_norm": 1.4061897285537437, + "learning_rate": 4.0778772721009036e-07, + "loss": 1.6285, + "step": 8120 + }, + { + "epoch": 0.30071854738903153, + "grad_norm": 1.3965741494953192, + "learning_rate": 4.073657405424019e-07, + "loss": 1.6656, + "step": 8140 + }, + { + "epoch": 0.3014574135988326, + "grad_norm": 1.5484468689064121, + "learning_rate": 4.06943040655885e-07, + "loss": 1.661, + "step": 8160 + }, + { + "epoch": 0.30219627980863367, + "grad_norm": 1.5843927161871971, + "learning_rate": 4.065196298281493e-07, + "loss": 1.6622, + "step": 8180 + }, + { + "epoch": 0.3029351460184347, + "grad_norm": 1.6553065392619284, + "learning_rate": 4.0609551034063555e-07, + "loss": 1.6989, + "step": 8200 + }, + { + "epoch": 0.30367401222823576, + "grad_norm": 1.6004229625484228, + "learning_rate": 4.056706844786025e-07, + "loss": 1.6673, + "step": 8220 + }, + { + "epoch": 0.3044128784380368, + "grad_norm": 1.7218496726083523, + "learning_rate": 4.052451545311157e-07, + "loss": 1.7071, + "step": 8240 + }, + { + "epoch": 0.3051517446478379, + "grad_norm": 1.4453612541643919, + "learning_rate": 4.0481892279103375e-07, + "loss": 1.6418, + "step": 8260 + }, + { + "epoch": 0.30589061085763897, + "grad_norm": 2.0343056912272415, + "learning_rate": 4.043919915549972e-07, + "loss": 1.6406, + "step": 8280 + }, + { + "epoch": 0.30662947706744004, + "grad_norm": 1.4141851056827188, + "learning_rate": 4.0396436312341537e-07, + "loss": 1.6697, + "step": 8300 + }, + { + "epoch": 0.30736834327724105, + "grad_norm": 1.7030187367387806, + "learning_rate": 4.0353603980045434e-07, + "loss": 1.648, + "step": 8320 + }, + { + "epoch": 0.3081072094870421, + "grad_norm": 1.4580931131013146, + "learning_rate": 4.0310702389402455e-07, + "loss": 1.6738, + "step": 8340 + }, + { + "epoch": 0.3088460756968432, + "grad_norm": 1.6315260212867364, + "learning_rate": 4.0267731771576795e-07, + "loss": 1.6568, + "step": 8360 + }, + { + "epoch": 0.30958494190664426, + "grad_norm": 1.760277165218215, + "learning_rate": 4.022469235810462e-07, + "loss": 1.7044, + "step": 8380 + }, + { + "epoch": 0.31032380811644533, + "grad_norm": 1.5247483148379708, + "learning_rate": 4.0181584380892747e-07, + "loss": 1.625, + "step": 8400 + }, + { + "epoch": 0.3110626743262464, + "grad_norm": 1.6055425468824278, + "learning_rate": 4.0138408072217467e-07, + "loss": 1.6332, + "step": 8420 + }, + { + "epoch": 0.3118015405360474, + "grad_norm": 2.522263277058951, + "learning_rate": 4.009516366472323e-07, + "loss": 1.6795, + "step": 8440 + }, + { + "epoch": 0.3125404067458485, + "grad_norm": 1.4776229994815417, + "learning_rate": 4.005185139142143e-07, + "loss": 1.6675, + "step": 8460 + }, + { + "epoch": 0.31327927295564956, + "grad_norm": 1.458660936186841, + "learning_rate": 4.000847148568915e-07, + "loss": 1.661, + "step": 8480 + }, + { + "epoch": 0.3140181391654506, + "grad_norm": 1.5895551714359692, + "learning_rate": 3.9965024181267865e-07, + "loss": 1.6474, + "step": 8500 + }, + { + "epoch": 0.3147570053752517, + "grad_norm": 1.6027764846949324, + "learning_rate": 3.9921509712262237e-07, + "loss": 1.7055, + "step": 8520 + }, + { + "epoch": 0.3154958715850527, + "grad_norm": 1.4709407841933115, + "learning_rate": 3.9877928313138807e-07, + "loss": 1.6721, + "step": 8540 + }, + { + "epoch": 0.3162347377948538, + "grad_norm": 1.4461242455876133, + "learning_rate": 3.983428021872477e-07, + "loss": 1.6496, + "step": 8560 + }, + { + "epoch": 0.31697360400465485, + "grad_norm": 1.4524171700785795, + "learning_rate": 3.979056566420668e-07, + "loss": 1.6553, + "step": 8580 + }, + { + "epoch": 0.3177124702144559, + "grad_norm": 1.5057325136067627, + "learning_rate": 3.974678488512921e-07, + "loss": 1.6723, + "step": 8600 + }, + { + "epoch": 0.318451336424257, + "grad_norm": 1.4293777770249827, + "learning_rate": 3.9702938117393825e-07, + "loss": 1.6586, + "step": 8620 + }, + { + "epoch": 0.31919020263405806, + "grad_norm": 1.4212368243075615, + "learning_rate": 3.965902559725761e-07, + "loss": 1.6458, + "step": 8640 + }, + { + "epoch": 0.3199290688438591, + "grad_norm": 1.4727420961415922, + "learning_rate": 3.961504756133189e-07, + "loss": 1.6481, + "step": 8660 + }, + { + "epoch": 0.32066793505366015, + "grad_norm": 2.5900548552419895, + "learning_rate": 3.9573207959028544e-07, + "loss": 1.621, + "step": 8680 + }, + { + "epoch": 0.3214068012634612, + "grad_norm": 1.5430259080799726, + "learning_rate": 3.952910284920244e-07, + "loss": 1.6812, + "step": 8700 + }, + { + "epoch": 0.3221456674732623, + "grad_norm": 1.4794345694793534, + "learning_rate": 3.948493292364224e-07, + "loss": 1.6585, + "step": 8720 + }, + { + "epoch": 0.32288453368306336, + "grad_norm": 1.4614630552620829, + "learning_rate": 3.9440698420346246e-07, + "loss": 1.6466, + "step": 8740 + }, + { + "epoch": 0.3236233998928644, + "grad_norm": 1.4393288175430394, + "learning_rate": 3.939639957766073e-07, + "loss": 1.6215, + "step": 8760 + }, + { + "epoch": 0.32436226610266544, + "grad_norm": 2.1230018342791532, + "learning_rate": 3.9352036634278634e-07, + "loss": 1.6803, + "step": 8780 + }, + { + "epoch": 0.3251011323124665, + "grad_norm": 1.6164570568462948, + "learning_rate": 3.9307609829238297e-07, + "loss": 1.6766, + "step": 8800 + }, + { + "epoch": 0.3258399985222676, + "grad_norm": 1.4370335980422504, + "learning_rate": 3.9263119401922175e-07, + "loss": 1.6822, + "step": 8820 + }, + { + "epoch": 0.32657886473206865, + "grad_norm": 1.644081010299245, + "learning_rate": 3.9218565592055486e-07, + "loss": 1.6633, + "step": 8840 + }, + { + "epoch": 0.3273177309418697, + "grad_norm": 2.1011988058241173, + "learning_rate": 3.9173948639705027e-07, + "loss": 1.6765, + "step": 8860 + }, + { + "epoch": 0.32805659715167074, + "grad_norm": 2.151384135030328, + "learning_rate": 3.9129268785277796e-07, + "loss": 1.6465, + "step": 8880 + }, + { + "epoch": 0.3287954633614718, + "grad_norm": 1.4309025880636768, + "learning_rate": 3.908452626951972e-07, + "loss": 1.6543, + "step": 8900 + }, + { + "epoch": 0.3295343295712729, + "grad_norm": 1.8849999578121595, + "learning_rate": 3.903972133351436e-07, + "loss": 1.6514, + "step": 8920 + }, + { + "epoch": 0.33027319578107395, + "grad_norm": 1.7164685196230511, + "learning_rate": 3.8994854218681627e-07, + "loss": 1.7006, + "step": 8940 + }, + { + "epoch": 0.331012061990875, + "grad_norm": 1.4964402365248954, + "learning_rate": 3.8949925166776454e-07, + "loss": 1.6995, + "step": 8960 + }, + { + "epoch": 0.3317509282006761, + "grad_norm": 1.9725561956682367, + "learning_rate": 3.8904934419887493e-07, + "loss": 1.634, + "step": 8980 + }, + { + "epoch": 0.3324897944104771, + "grad_norm": 1.604770043849599, + "learning_rate": 3.885988222043586e-07, + "loss": 1.6307, + "step": 9000 + }, + { + "epoch": 0.33322866062027817, + "grad_norm": 1.4014528232679808, + "learning_rate": 3.881476881117376e-07, + "loss": 1.6384, + "step": 9020 + }, + { + "epoch": 0.33396752683007924, + "grad_norm": 1.5592294550988919, + "learning_rate": 3.876959443518323e-07, + "loss": 1.6893, + "step": 9040 + }, + { + "epoch": 0.3347063930398803, + "grad_norm": 1.512028885113723, + "learning_rate": 3.872662252925764e-07, + "loss": 1.6126, + "step": 9060 + }, + { + "epoch": 0.3354452592496814, + "grad_norm": 1.5167336039874841, + "learning_rate": 3.868132996855423e-07, + "loss": 1.6438, + "step": 9080 + }, + { + "epoch": 0.3361841254594824, + "grad_norm": 1.5732905269770532, + "learning_rate": 3.8635977160123356e-07, + "loss": 1.6129, + "step": 9100 + }, + { + "epoch": 0.33692299166928347, + "grad_norm": 1.6825164459147328, + "learning_rate": 3.859056434833698e-07, + "loss": 1.611, + "step": 9120 + }, + { + "epoch": 0.33766185787908454, + "grad_norm": 2.3767246380889095, + "learning_rate": 3.854509177789039e-07, + "loss": 1.6473, + "step": 9140 + }, + { + "epoch": 0.3384007240888856, + "grad_norm": 1.51475900965411, + "learning_rate": 3.8499559693800866e-07, + "loss": 1.6696, + "step": 9160 + }, + { + "epoch": 0.3391395902986867, + "grad_norm": 2.1798994146623496, + "learning_rate": 3.845396834140635e-07, + "loss": 1.6272, + "step": 9180 + }, + { + "epoch": 0.33987845650848775, + "grad_norm": 5.503662773520221, + "learning_rate": 3.8408317966364155e-07, + "loss": 1.6598, + "step": 9200 + }, + { + "epoch": 0.34061732271828876, + "grad_norm": 1.4387011677124582, + "learning_rate": 3.836260881464961e-07, + "loss": 1.6327, + "step": 9220 + }, + { + "epoch": 0.34135618892808983, + "grad_norm": 1.8647315334479582, + "learning_rate": 3.831684113255475e-07, + "loss": 1.6511, + "step": 9240 + }, + { + "epoch": 0.3420950551378909, + "grad_norm": 1.4777808537198769, + "learning_rate": 3.8271015166686987e-07, + "loss": 1.6361, + "step": 9260 + }, + { + "epoch": 0.34283392134769197, + "grad_norm": 2.045197276055339, + "learning_rate": 3.822513116396778e-07, + "loss": 1.6659, + "step": 9280 + }, + { + "epoch": 0.34357278755749304, + "grad_norm": 1.7790240681877276, + "learning_rate": 3.8179189371631307e-07, + "loss": 1.617, + "step": 9300 + }, + { + "epoch": 0.3443116537672941, + "grad_norm": 1.6594283041904447, + "learning_rate": 3.813319003722312e-07, + "loss": 1.6798, + "step": 9320 + }, + { + "epoch": 0.3450505199770951, + "grad_norm": 1.5722518111489987, + "learning_rate": 3.8087133408598837e-07, + "loss": 1.6448, + "step": 9340 + }, + { + "epoch": 0.3457893861868962, + "grad_norm": 1.3834190123625751, + "learning_rate": 3.804101973392278e-07, + "loss": 1.6937, + "step": 9360 + }, + { + "epoch": 0.34652825239669727, + "grad_norm": 2.860970712860898, + "learning_rate": 3.799484926166665e-07, + "loss": 1.6803, + "step": 9380 + }, + { + "epoch": 0.34726711860649834, + "grad_norm": 1.7303789413551895, + "learning_rate": 3.794862224060819e-07, + "loss": 1.6652, + "step": 9400 + }, + { + "epoch": 0.3480059848162994, + "grad_norm": 1.5722357665247504, + "learning_rate": 3.7902338919829854e-07, + "loss": 1.6824, + "step": 9420 + }, + { + "epoch": 0.3487448510261004, + "grad_norm": 1.4942909416069685, + "learning_rate": 3.785599954871741e-07, + "loss": 1.6334, + "step": 9440 + }, + { + "epoch": 0.3494837172359015, + "grad_norm": 1.5407701751336818, + "learning_rate": 3.7809604376958705e-07, + "loss": 1.6147, + "step": 9460 + }, + { + "epoch": 0.35022258344570256, + "grad_norm": 1.5151800327591411, + "learning_rate": 3.7763153654542187e-07, + "loss": 1.6591, + "step": 9480 + }, + { + "epoch": 0.35096144965550363, + "grad_norm": 1.5820720313790753, + "learning_rate": 3.7716647631755684e-07, + "loss": 1.6267, + "step": 9500 + }, + { + "epoch": 0.3517003158653047, + "grad_norm": 1.7136185539713005, + "learning_rate": 3.7670086559184944e-07, + "loss": 1.6443, + "step": 9520 + }, + { + "epoch": 0.3524391820751058, + "grad_norm": 1.6610072999142345, + "learning_rate": 3.7623470687712363e-07, + "loss": 1.6391, + "step": 9540 + }, + { + "epoch": 0.3531780482849068, + "grad_norm": 1.7561532016780041, + "learning_rate": 3.7576800268515615e-07, + "loss": 1.6403, + "step": 9560 + }, + { + "epoch": 0.35391691449470786, + "grad_norm": 1.6534365111706855, + "learning_rate": 3.7530075553066256e-07, + "loss": 1.6604, + "step": 9580 + }, + { + "epoch": 0.3546557807045089, + "grad_norm": 1.5197922636545014, + "learning_rate": 3.748329679312845e-07, + "loss": 1.6005, + "step": 9600 + }, + { + "epoch": 0.35539464691431, + "grad_norm": 2.1221364447575635, + "learning_rate": 3.743646424075753e-07, + "loss": 1.6302, + "step": 9620 + }, + { + "epoch": 0.35613351312411107, + "grad_norm": 1.520654127135304, + "learning_rate": 3.738957814829868e-07, + "loss": 1.7174, + "step": 9640 + }, + { + "epoch": 0.35687237933391214, + "grad_norm": 1.5099869797232601, + "learning_rate": 3.7342638768385597e-07, + "loss": 1.6592, + "step": 9660 + }, + { + "epoch": 0.35761124554371315, + "grad_norm": 1.8304484700278734, + "learning_rate": 3.729564635393907e-07, + "loss": 1.6745, + "step": 9680 + }, + { + "epoch": 0.3583501117535142, + "grad_norm": 1.778696114508267, + "learning_rate": 3.7248601158165674e-07, + "loss": 1.6592, + "step": 9700 + }, + { + "epoch": 0.3590889779633153, + "grad_norm": 1.4183327236752137, + "learning_rate": 3.720150343455638e-07, + "loss": 1.6637, + "step": 9720 + }, + { + "epoch": 0.35982784417311636, + "grad_norm": 1.559240346976758, + "learning_rate": 3.715435343688517e-07, + "loss": 1.6862, + "step": 9740 + }, + { + "epoch": 0.36056671038291743, + "grad_norm": 1.5461740842164586, + "learning_rate": 3.710715141920772e-07, + "loss": 1.6276, + "step": 9760 + }, + { + "epoch": 0.36130557659271845, + "grad_norm": 1.541024781373399, + "learning_rate": 3.705989763585998e-07, + "loss": 1.6519, + "step": 9780 + }, + { + "epoch": 0.3620444428025195, + "grad_norm": 1.568073509021964, + "learning_rate": 3.7012592341456855e-07, + "loss": 1.644, + "step": 9800 + }, + { + "epoch": 0.3627833090123206, + "grad_norm": 7.164278419276029, + "learning_rate": 3.6965235790890776e-07, + "loss": 1.6649, + "step": 9820 + }, + { + "epoch": 0.36352217522212166, + "grad_norm": 1.6290047071156604, + "learning_rate": 3.6917828239330364e-07, + "loss": 1.6321, + "step": 9840 + }, + { + "epoch": 0.3642610414319227, + "grad_norm": 2.2138525137520078, + "learning_rate": 3.6870369942219043e-07, + "loss": 1.6623, + "step": 9860 + }, + { + "epoch": 0.3649999076417238, + "grad_norm": 1.4780745550505248, + "learning_rate": 3.6822861155273664e-07, + "loss": 1.6303, + "step": 9880 + }, + { + "epoch": 0.3657387738515248, + "grad_norm": 1.6513433655082623, + "learning_rate": 3.677530213448315e-07, + "loss": 1.6678, + "step": 9900 + }, + { + "epoch": 0.3664776400613259, + "grad_norm": 1.4330452468765504, + "learning_rate": 3.6727693136107074e-07, + "loss": 1.6411, + "step": 9920 + }, + { + "epoch": 0.36721650627112695, + "grad_norm": 2.1041910204234773, + "learning_rate": 3.668241852955783e-07, + "loss": 1.6638, + "step": 9940 + }, + { + "epoch": 0.367955372480928, + "grad_norm": 1.579705325259841, + "learning_rate": 3.66347128129751e-07, + "loss": 1.6245, + "step": 9960 + }, + { + "epoch": 0.3686942386907291, + "grad_norm": 2.2840341365356185, + "learning_rate": 3.65869578763363e-07, + "loss": 1.6621, + "step": 9980 + }, + { + "epoch": 0.36943310490053016, + "grad_norm": 1.4886178225841975, + "learning_rate": 3.6539153976956643e-07, + "loss": 1.6815, + "step": 10000 + }, + { + "epoch": 0.3701719711103312, + "grad_norm": 2.0581153395070952, + "learning_rate": 3.6491301372415173e-07, + "loss": 1.6911, + "step": 10020 + }, + { + "epoch": 0.37091083732013225, + "grad_norm": 1.5433010278052928, + "learning_rate": 3.6443400320553387e-07, + "loss": 1.6726, + "step": 10040 + }, + { + "epoch": 0.3716497035299333, + "grad_norm": 1.3650733078052242, + "learning_rate": 3.6395451079473785e-07, + "loss": 1.6808, + "step": 10060 + }, + { + "epoch": 0.3723885697397344, + "grad_norm": 1.4829849508478619, + "learning_rate": 3.634745390753857e-07, + "loss": 1.638, + "step": 10080 + }, + { + "epoch": 0.37312743594953546, + "grad_norm": 1.4843368467181628, + "learning_rate": 3.6299409063368177e-07, + "loss": 1.6608, + "step": 10100 + }, + { + "epoch": 0.37386630215933647, + "grad_norm": 1.7135290138411319, + "learning_rate": 3.6251316805839925e-07, + "loss": 1.6201, + "step": 10120 + }, + { + "epoch": 0.37460516836913754, + "grad_norm": 1.4665338261705847, + "learning_rate": 3.6203177394086603e-07, + "loss": 1.6576, + "step": 10140 + }, + { + "epoch": 0.3753440345789386, + "grad_norm": 1.523807524784342, + "learning_rate": 3.615499108749508e-07, + "loss": 1.6531, + "step": 10160 + }, + { + "epoch": 0.3760829007887397, + "grad_norm": 1.4605532197043567, + "learning_rate": 3.6106758145704903e-07, + "loss": 1.6351, + "step": 10180 + }, + { + "epoch": 0.37682176699854075, + "grad_norm": 1.4767414919395185, + "learning_rate": 3.6058478828606904e-07, + "loss": 1.6816, + "step": 10200 + }, + { + "epoch": 0.3775606332083418, + "grad_norm": 3.319352148345807, + "learning_rate": 3.601015339634179e-07, + "loss": 1.646, + "step": 10220 + }, + { + "epoch": 0.37829949941814284, + "grad_norm": 1.6462705304843952, + "learning_rate": 3.5961782109298767e-07, + "loss": 1.6572, + "step": 10240 + }, + { + "epoch": 0.3790383656279439, + "grad_norm": 1.987828688877245, + "learning_rate": 3.5913365228114085e-07, + "loss": 1.6272, + "step": 10260 + }, + { + "epoch": 0.379777231837745, + "grad_norm": 1.5685525483250444, + "learning_rate": 3.5864903013669696e-07, + "loss": 1.629, + "step": 10280 + }, + { + "epoch": 0.38051609804754605, + "grad_norm": 1.454531386924792, + "learning_rate": 3.58163957270918e-07, + "loss": 1.6391, + "step": 10300 + }, + { + "epoch": 0.3812549642573471, + "grad_norm": 1.5741474691311197, + "learning_rate": 3.5767843629749465e-07, + "loss": 1.6497, + "step": 10320 + }, + { + "epoch": 0.3819938304671482, + "grad_norm": 1.494255550534897, + "learning_rate": 3.5719246983253227e-07, + "loss": 1.6584, + "step": 10340 + }, + { + "epoch": 0.3827326966769492, + "grad_norm": 1.5743114630725665, + "learning_rate": 3.5670606049453624e-07, + "loss": 1.6333, + "step": 10360 + }, + { + "epoch": 0.3834715628867503, + "grad_norm": 1.5229234435536247, + "learning_rate": 3.5621921090439856e-07, + "loss": 1.651, + "step": 10380 + }, + { + "epoch": 0.38421042909655134, + "grad_norm": 1.5784429804907898, + "learning_rate": 3.557319236853833e-07, + "loss": 1.6922, + "step": 10400 + }, + { + "epoch": 0.3849492953063524, + "grad_norm": 1.581472732564025, + "learning_rate": 3.552442014631125e-07, + "loss": 1.6725, + "step": 10420 + }, + { + "epoch": 0.3856881615161535, + "grad_norm": 1.5126802451542531, + "learning_rate": 3.5475604686555246e-07, + "loss": 1.6944, + "step": 10440 + }, + { + "epoch": 0.3864270277259545, + "grad_norm": 1.5957042160618131, + "learning_rate": 3.5426746252299876e-07, + "loss": 1.6474, + "step": 10460 + }, + { + "epoch": 0.38716589393575557, + "grad_norm": 1.5167798574452542, + "learning_rate": 3.537784510680629e-07, + "loss": 1.6269, + "step": 10480 + }, + { + "epoch": 0.38790476014555664, + "grad_norm": 1.4073803006779033, + "learning_rate": 3.5328901513565755e-07, + "loss": 1.667, + "step": 10500 + }, + { + "epoch": 0.3886436263553577, + "grad_norm": 1.5025049762633182, + "learning_rate": 3.527991573629826e-07, + "loss": 1.6685, + "step": 10520 + }, + { + "epoch": 0.3893824925651588, + "grad_norm": 1.498817940042482, + "learning_rate": 3.523088803895111e-07, + "loss": 1.6693, + "step": 10540 + }, + { + "epoch": 0.39012135877495985, + "grad_norm": 1.5375475807699233, + "learning_rate": 3.5181818685697454e-07, + "loss": 1.6257, + "step": 10560 + }, + { + "epoch": 0.39086022498476086, + "grad_norm": 1.4788669954107543, + "learning_rate": 3.513270794093493e-07, + "loss": 1.6396, + "step": 10580 + }, + { + "epoch": 0.39159909119456193, + "grad_norm": 1.8280175785471986, + "learning_rate": 3.508355606928417e-07, + "loss": 1.6708, + "step": 10600 + }, + { + "epoch": 0.392337957404363, + "grad_norm": 1.657327382022486, + "learning_rate": 3.503436333558744e-07, + "loss": 1.6344, + "step": 10620 + }, + { + "epoch": 0.3930768236141641, + "grad_norm": 3.2933368891799772, + "learning_rate": 3.498513000490713e-07, + "loss": 1.6233, + "step": 10640 + }, + { + "epoch": 0.39381568982396514, + "grad_norm": 1.5787521448516106, + "learning_rate": 3.4935856342524445e-07, + "loss": 1.6504, + "step": 10660 + }, + { + "epoch": 0.3945545560337662, + "grad_norm": 1.7273082957996757, + "learning_rate": 3.488654261393786e-07, + "loss": 1.6501, + "step": 10680 + }, + { + "epoch": 0.3952934222435672, + "grad_norm": 1.5427159019633168, + "learning_rate": 3.483718908486173e-07, + "loss": 1.6213, + "step": 10700 + }, + { + "epoch": 0.3960322884533683, + "grad_norm": 2.4791279004019944, + "learning_rate": 3.478779602122491e-07, + "loss": 1.6341, + "step": 10720 + }, + { + "epoch": 0.39677115466316937, + "grad_norm": 1.5057908958686839, + "learning_rate": 3.4738363689169227e-07, + "loss": 1.6344, + "step": 10740 + }, + { + "epoch": 0.39751002087297044, + "grad_norm": 1.6211537727930727, + "learning_rate": 3.4688892355048133e-07, + "loss": 1.6684, + "step": 10760 + }, + { + "epoch": 0.3982488870827715, + "grad_norm": 1.7112433425010558, + "learning_rate": 3.4639382285425217e-07, + "loss": 1.6742, + "step": 10780 + }, + { + "epoch": 0.3989877532925725, + "grad_norm": 1.7626819549867558, + "learning_rate": 3.4589833747072765e-07, + "loss": 1.6497, + "step": 10800 + }, + { + "epoch": 0.3997266195023736, + "grad_norm": 1.536514259186305, + "learning_rate": 3.4540247006970395e-07, + "loss": 1.6533, + "step": 10820 + }, + { + "epoch": 0.40046548571217466, + "grad_norm": 1.4352156142464503, + "learning_rate": 3.449062233230351e-07, + "loss": 1.6423, + "step": 10840 + }, + { + "epoch": 0.40120435192197573, + "grad_norm": 1.517870844401341, + "learning_rate": 3.4440959990461936e-07, + "loss": 1.6888, + "step": 10860 + }, + { + "epoch": 0.4019432181317768, + "grad_norm": 1.6903764999597104, + "learning_rate": 3.4391260249038467e-07, + "loss": 1.6242, + "step": 10880 + }, + { + "epoch": 0.4026820843415779, + "grad_norm": 1.9353070894961153, + "learning_rate": 3.4341523375827407e-07, + "loss": 1.6219, + "step": 10900 + }, + { + "epoch": 0.4034209505513789, + "grad_norm": 1.70733565978221, + "learning_rate": 3.4291749638823144e-07, + "loss": 1.6524, + "step": 10920 + }, + { + "epoch": 0.40415981676117996, + "grad_norm": 1.3794756923120337, + "learning_rate": 3.4241939306218655e-07, + "loss": 1.647, + "step": 10940 + }, + { + "epoch": 0.404898682970981, + "grad_norm": 1.4536895089620647, + "learning_rate": 3.4192092646404166e-07, + "loss": 1.6697, + "step": 10960 + }, + { + "epoch": 0.4056375491807821, + "grad_norm": 1.4185925084451405, + "learning_rate": 3.41422099279656e-07, + "loss": 1.6916, + "step": 10980 + }, + { + "epoch": 0.40637641539058317, + "grad_norm": 1.5516883391882288, + "learning_rate": 3.40922914196832e-07, + "loss": 1.6702, + "step": 11000 + }, + { + "epoch": 0.40711528160038424, + "grad_norm": 1.500896700694977, + "learning_rate": 3.4042337390530027e-07, + "loss": 1.6379, + "step": 11020 + }, + { + "epoch": 0.40785414781018525, + "grad_norm": 1.4488842610705819, + "learning_rate": 3.399234810967055e-07, + "loss": 1.6322, + "step": 11040 + }, + { + "epoch": 0.4085930140199863, + "grad_norm": 1.5363179452812292, + "learning_rate": 3.394232384645918e-07, + "loss": 1.7085, + "step": 11060 + }, + { + "epoch": 0.4093318802297874, + "grad_norm": 1.6587795154693055, + "learning_rate": 3.389226487043883e-07, + "loss": 1.6212, + "step": 11080 + }, + { + "epoch": 0.41007074643958846, + "grad_norm": 2.185811847037595, + "learning_rate": 3.3842171451339446e-07, + "loss": 1.653, + "step": 11100 + }, + { + "epoch": 0.41080961264938953, + "grad_norm": 1.4930598472252423, + "learning_rate": 3.3792043859076556e-07, + "loss": 1.6401, + "step": 11120 + }, + { + "epoch": 0.41154847885919055, + "grad_norm": 1.585267885050689, + "learning_rate": 3.3741882363749836e-07, + "loss": 1.6081, + "step": 11140 + }, + { + "epoch": 0.4122873450689916, + "grad_norm": 1.5745770350836434, + "learning_rate": 3.3691687235641633e-07, + "loss": 1.6657, + "step": 11160 + }, + { + "epoch": 0.4130262112787927, + "grad_norm": 1.638169374979827, + "learning_rate": 3.364145874521552e-07, + "loss": 1.6439, + "step": 11180 + }, + { + "epoch": 0.41376507748859376, + "grad_norm": 1.5771694576157802, + "learning_rate": 3.3591197163114807e-07, + "loss": 1.6344, + "step": 11200 + }, + { + "epoch": 0.41450394369839483, + "grad_norm": 1.507624879108444, + "learning_rate": 3.3540902760161153e-07, + "loss": 1.6414, + "step": 11220 + }, + { + "epoch": 0.4152428099081959, + "grad_norm": 1.5517359392564993, + "learning_rate": 3.349057580735304e-07, + "loss": 1.6103, + "step": 11240 + }, + { + "epoch": 0.4159816761179969, + "grad_norm": 1.6910189529581492, + "learning_rate": 3.3440216575864336e-07, + "loss": 1.6097, + "step": 11260 + }, + { + "epoch": 0.416720542327798, + "grad_norm": 1.4817048826322234, + "learning_rate": 3.338982533704284e-07, + "loss": 1.6322, + "step": 11280 + }, + { + "epoch": 0.41745940853759905, + "grad_norm": 2.4572073331823843, + "learning_rate": 3.3339402362408803e-07, + "loss": 1.6818, + "step": 11300 + }, + { + "epoch": 0.4181982747474001, + "grad_norm": 1.4690103698141457, + "learning_rate": 3.32889479236535e-07, + "loss": 1.6734, + "step": 11320 + }, + { + "epoch": 0.4189371409572012, + "grad_norm": 1.4525562290767953, + "learning_rate": 3.323846229263772e-07, + "loss": 1.6777, + "step": 11340 + }, + { + "epoch": 0.4196760071670022, + "grad_norm": 1.6088576080590102, + "learning_rate": 3.318794574139033e-07, + "loss": 1.6815, + "step": 11360 + }, + { + "epoch": 0.4204148733768033, + "grad_norm": 1.658735344378412, + "learning_rate": 3.3137398542106816e-07, + "loss": 1.7156, + "step": 11380 + }, + { + "epoch": 0.42115373958660435, + "grad_norm": 1.856711421074202, + "learning_rate": 3.308682096714777e-07, + "loss": 1.6056, + "step": 11400 + }, + { + "epoch": 0.4218926057964054, + "grad_norm": 1.524820866790581, + "learning_rate": 3.3036213289037494e-07, + "loss": 1.653, + "step": 11420 + }, + { + "epoch": 0.4226314720062065, + "grad_norm": 2.091225075765613, + "learning_rate": 3.298557578046248e-07, + "loss": 1.6344, + "step": 11440 + }, + { + "epoch": 0.42337033821600756, + "grad_norm": 1.5873899994137428, + "learning_rate": 3.2934908714269926e-07, + "loss": 1.7056, + "step": 11460 + }, + { + "epoch": 0.4241092044258086, + "grad_norm": 1.530785170405434, + "learning_rate": 3.2884212363466336e-07, + "loss": 1.6592, + "step": 11480 + }, + { + "epoch": 0.42484807063560964, + "grad_norm": 1.4187769683759475, + "learning_rate": 3.283348700121599e-07, + "loss": 1.6155, + "step": 11500 + }, + { + "epoch": 0.4255869368454107, + "grad_norm": 1.7098484503844666, + "learning_rate": 3.278273290083948e-07, + "loss": 1.6145, + "step": 11520 + }, + { + "epoch": 0.4263258030552118, + "grad_norm": 1.6337855300981592, + "learning_rate": 3.2731950335812245e-07, + "loss": 1.6718, + "step": 11540 + }, + { + "epoch": 0.42706466926501285, + "grad_norm": 1.562376692174843, + "learning_rate": 3.2681139579763116e-07, + "loss": 1.6299, + "step": 11560 + }, + { + "epoch": 0.4278035354748139, + "grad_norm": 1.7830680336877842, + "learning_rate": 3.263030090647282e-07, + "loss": 1.6427, + "step": 11580 + }, + { + "epoch": 0.42854240168461494, + "grad_norm": 1.67004917671626, + "learning_rate": 3.2579434589872487e-07, + "loss": 1.6645, + "step": 11600 + }, + { + "epoch": 0.429281267894416, + "grad_norm": 1.6704228734275928, + "learning_rate": 3.2528540904042226e-07, + "loss": 1.6427, + "step": 11620 + }, + { + "epoch": 0.4300201341042171, + "grad_norm": 1.4195450351330696, + "learning_rate": 3.24776201232096e-07, + "loss": 1.62, + "step": 11640 + }, + { + "epoch": 0.43075900031401815, + "grad_norm": 1.5285023969215334, + "learning_rate": 3.242667252174816e-07, + "loss": 1.6654, + "step": 11660 + }, + { + "epoch": 0.4314978665238192, + "grad_norm": 1.4602369388272751, + "learning_rate": 3.2375698374176e-07, + "loss": 1.6073, + "step": 11680 + }, + { + "epoch": 0.43223673273362023, + "grad_norm": 1.4791375841387864, + "learning_rate": 3.232469795515423e-07, + "loss": 1.6277, + "step": 11700 + }, + { + "epoch": 0.4329755989434213, + "grad_norm": 1.4365509577307647, + "learning_rate": 3.227367153948551e-07, + "loss": 1.6678, + "step": 11720 + }, + { + "epoch": 0.4337144651532224, + "grad_norm": 1.4925933032216425, + "learning_rate": 3.22226194021126e-07, + "loss": 1.6138, + "step": 11740 + }, + { + "epoch": 0.43445333136302344, + "grad_norm": 1.5965165214882902, + "learning_rate": 3.2171541818116844e-07, + "loss": 1.682, + "step": 11760 + }, + { + "epoch": 0.4351921975728245, + "grad_norm": 1.622561586319955, + "learning_rate": 3.2120439062716673e-07, + "loss": 1.6685, + "step": 11780 + }, + { + "epoch": 0.4359310637826256, + "grad_norm": 1.5068996818021825, + "learning_rate": 3.206931141126622e-07, + "loss": 1.6353, + "step": 11800 + }, + { + "epoch": 0.4366699299924266, + "grad_norm": 1.5980487695346257, + "learning_rate": 3.2018159139253667e-07, + "loss": 1.6442, + "step": 11820 + }, + { + "epoch": 0.43740879620222767, + "grad_norm": 1.9446682447819341, + "learning_rate": 3.1966982522299927e-07, + "loss": 1.6215, + "step": 11840 + }, + { + "epoch": 0.43814766241202874, + "grad_norm": 1.3911283325778476, + "learning_rate": 3.1915781836157076e-07, + "loss": 1.6237, + "step": 11860 + }, + { + "epoch": 0.4388865286218298, + "grad_norm": 1.7379788181506113, + "learning_rate": 3.1864557356706854e-07, + "loss": 1.6311, + "step": 11880 + }, + { + "epoch": 0.4396253948316309, + "grad_norm": 1.5960691894661032, + "learning_rate": 3.181330935995925e-07, + "loss": 1.6967, + "step": 11900 + }, + { + "epoch": 0.44036426104143195, + "grad_norm": 1.334622875404918, + "learning_rate": 3.176203812205092e-07, + "loss": 1.7151, + "step": 11920 + }, + { + "epoch": 0.44110312725123296, + "grad_norm": 2.3408851593313287, + "learning_rate": 3.171074391924379e-07, + "loss": 1.6204, + "step": 11940 + }, + { + "epoch": 0.44184199346103403, + "grad_norm": 1.517416691835459, + "learning_rate": 3.16594270279235e-07, + "loss": 1.647, + "step": 11960 + }, + { + "epoch": 0.4425808596708351, + "grad_norm": 1.732092967222855, + "learning_rate": 3.160808772459796e-07, + "loss": 1.6246, + "step": 11980 + }, + { + "epoch": 0.4433197258806362, + "grad_norm": 1.4748895033828555, + "learning_rate": 3.155672628589582e-07, + "loss": 1.6559, + "step": 12000 + }, + { + "epoch": 0.44405859209043724, + "grad_norm": 1.466688995230755, + "learning_rate": 3.1505342988565024e-07, + "loss": 1.6631, + "step": 12020 + }, + { + "epoch": 0.44479745830023826, + "grad_norm": 1.5762348950247518, + "learning_rate": 3.145393810947129e-07, + "loss": 1.6507, + "step": 12040 + }, + { + "epoch": 0.4455363245100393, + "grad_norm": 1.5705066014221254, + "learning_rate": 3.1402511925596604e-07, + "loss": 1.6218, + "step": 12060 + }, + { + "epoch": 0.4462751907198404, + "grad_norm": 1.5033544192166477, + "learning_rate": 3.135106471403778e-07, + "loss": 1.6645, + "step": 12080 + }, + { + "epoch": 0.44701405692964147, + "grad_norm": 1.8660368037827004, + "learning_rate": 3.1299596752004884e-07, + "loss": 1.6617, + "step": 12100 + }, + { + "epoch": 0.44775292313944254, + "grad_norm": 1.6278625709035912, + "learning_rate": 3.124810831681987e-07, + "loss": 1.6383, + "step": 12120 + }, + { + "epoch": 0.4484917893492436, + "grad_norm": 1.6698134882051106, + "learning_rate": 3.1196599685914916e-07, + "loss": 1.6691, + "step": 12140 + }, + { + "epoch": 0.4492306555590446, + "grad_norm": 1.5877476217951574, + "learning_rate": 3.114507113683109e-07, + "loss": 1.6091, + "step": 12160 + }, + { + "epoch": 0.4499695217688457, + "grad_norm": 1.533714449161249, + "learning_rate": 3.109352294721674e-07, + "loss": 1.6721, + "step": 12180 + }, + { + "epoch": 0.45070838797864676, + "grad_norm": 1.415779061176635, + "learning_rate": 3.104195539482607e-07, + "loss": 1.606, + "step": 12200 + }, + { + "epoch": 0.45144725418844783, + "grad_norm": 1.4338589085273825, + "learning_rate": 3.0990368757517605e-07, + "loss": 1.6661, + "step": 12220 + }, + { + "epoch": 0.4521861203982489, + "grad_norm": 1.8998339669584823, + "learning_rate": 3.093876331325269e-07, + "loss": 1.609, + "step": 12240 + }, + { + "epoch": 0.45292498660805, + "grad_norm": 1.384458068102408, + "learning_rate": 3.0889720974519455e-07, + "loss": 1.6454, + "step": 12260 + }, + { + "epoch": 0.453663852817851, + "grad_norm": 1.4452081009096462, + "learning_rate": 3.083807965655827e-07, + "loss": 1.6452, + "step": 12280 + }, + { + "epoch": 0.45440271902765206, + "grad_norm": 1.5698647385968285, + "learning_rate": 3.0786420352211376e-07, + "loss": 1.6741, + "step": 12300 + }, + { + "epoch": 0.45514158523745313, + "grad_norm": 1.9552580205602894, + "learning_rate": 3.0734743339831694e-07, + "loss": 1.6845, + "step": 12320 + }, + { + "epoch": 0.4558804514472542, + "grad_norm": 1.3583889408096808, + "learning_rate": 3.068304889786754e-07, + "loss": 1.6744, + "step": 12340 + }, + { + "epoch": 0.45661931765705527, + "grad_norm": 1.6780668319449847, + "learning_rate": 3.063133730486116e-07, + "loss": 1.6258, + "step": 12360 + }, + { + "epoch": 0.4573581838668563, + "grad_norm": 1.627173946323959, + "learning_rate": 3.057960883944719e-07, + "loss": 1.6198, + "step": 12380 + }, + { + "epoch": 0.45809705007665735, + "grad_norm": 1.3800453841054778, + "learning_rate": 3.0527863780351194e-07, + "loss": 1.6268, + "step": 12400 + }, + { + "epoch": 0.4588359162864584, + "grad_norm": 1.5516028071383072, + "learning_rate": 3.047610240638816e-07, + "loss": 1.679, + "step": 12420 + }, + { + "epoch": 0.4595747824962595, + "grad_norm": 1.546230302013408, + "learning_rate": 3.0424324996460955e-07, + "loss": 1.6234, + "step": 12440 + }, + { + "epoch": 0.46031364870606056, + "grad_norm": 1.5739393391599368, + "learning_rate": 3.037253182955887e-07, + "loss": 1.703, + "step": 12460 + }, + { + "epoch": 0.46105251491586163, + "grad_norm": 1.5792552039289542, + "learning_rate": 3.0320723184756095e-07, + "loss": 1.6453, + "step": 12480 + }, + { + "epoch": 0.46179138112566265, + "grad_norm": 1.5239329095833032, + "learning_rate": 3.026889934121023e-07, + "loss": 1.6553, + "step": 12500 + }, + { + "epoch": 0.4625302473354637, + "grad_norm": 1.4558048272931619, + "learning_rate": 3.021706057816074e-07, + "loss": 1.6563, + "step": 12520 + }, + { + "epoch": 0.4632691135452648, + "grad_norm": 1.5801820167249694, + "learning_rate": 3.0165207174927513e-07, + "loss": 1.6645, + "step": 12540 + }, + { + "epoch": 0.46400797975506586, + "grad_norm": 1.5560547577828236, + "learning_rate": 3.01133394109093e-07, + "loss": 1.6596, + "step": 12560 + }, + { + "epoch": 0.46474684596486693, + "grad_norm": 1.6818881647492323, + "learning_rate": 3.006145756558223e-07, + "loss": 1.6335, + "step": 12580 + }, + { + "epoch": 0.465485712174668, + "grad_norm": 1.6120666995517767, + "learning_rate": 3.0009561918498335e-07, + "loss": 1.6685, + "step": 12600 + }, + { + "epoch": 0.466224578384469, + "grad_norm": 1.4949729602626867, + "learning_rate": 2.995765274928398e-07, + "loss": 1.6753, + "step": 12620 + }, + { + "epoch": 0.4669634445942701, + "grad_norm": 1.5289962949889762, + "learning_rate": 2.9905730337638395e-07, + "loss": 1.6548, + "step": 12640 + }, + { + "epoch": 0.46770231080407115, + "grad_norm": 1.8299373423521412, + "learning_rate": 2.98537949633322e-07, + "loss": 1.5999, + "step": 12660 + }, + { + "epoch": 0.4684411770138722, + "grad_norm": 1.5948007806430553, + "learning_rate": 2.9801846906205794e-07, + "loss": 1.6638, + "step": 12680 + }, + { + "epoch": 0.4691800432236733, + "grad_norm": 1.418583561219425, + "learning_rate": 2.974988644616799e-07, + "loss": 1.6782, + "step": 12700 + }, + { + "epoch": 0.4699189094334743, + "grad_norm": 1.461318006445296, + "learning_rate": 2.9700512775939907e-07, + "loss": 1.6528, + "step": 12720 + }, + { + "epoch": 0.4706577756432754, + "grad_norm": 1.5468327583259127, + "learning_rate": 2.964852893556419e-07, + "loss": 1.6685, + "step": 12740 + }, + { + "epoch": 0.47139664185307645, + "grad_norm": 1.6470459204833447, + "learning_rate": 2.9596533518391615e-07, + "loss": 1.6733, + "step": 12760 + }, + { + "epoch": 0.4721355080628775, + "grad_norm": 1.624503313092944, + "learning_rate": 2.954452680458612e-07, + "loss": 1.6737, + "step": 12780 + }, + { + "epoch": 0.4728743742726786, + "grad_norm": 1.5728828027087576, + "learning_rate": 2.949250907437256e-07, + "loss": 1.6671, + "step": 12800 + }, + { + "epoch": 0.47361324048247966, + "grad_norm": 1.679151732155206, + "learning_rate": 2.944048060803512e-07, + "loss": 1.656, + "step": 12820 + }, + { + "epoch": 0.4743521066922807, + "grad_norm": 1.4259988112675113, + "learning_rate": 2.938844168591584e-07, + "loss": 1.6088, + "step": 12840 + }, + { + "epoch": 0.47509097290208174, + "grad_norm": 2.10422922646524, + "learning_rate": 2.933639258841309e-07, + "loss": 1.6411, + "step": 12860 + }, + { + "epoch": 0.4758298391118828, + "grad_norm": 1.809412517307293, + "learning_rate": 2.92843335959801e-07, + "loss": 1.654, + "step": 12880 + }, + { + "epoch": 0.4765687053216839, + "grad_norm": 1.6010915209622532, + "learning_rate": 2.923226498912336e-07, + "loss": 1.6653, + "step": 12900 + }, + { + "epoch": 0.47730757153148495, + "grad_norm": 1.7399335136485357, + "learning_rate": 2.918018704840123e-07, + "loss": 1.6839, + "step": 12920 + }, + { + "epoch": 0.478046437741286, + "grad_norm": 1.9845153410774579, + "learning_rate": 2.912810005442231e-07, + "loss": 1.6308, + "step": 12940 + }, + { + "epoch": 0.47878530395108704, + "grad_norm": 1.4672730941447367, + "learning_rate": 2.9076004287844007e-07, + "loss": 1.7158, + "step": 12960 + }, + { + "epoch": 0.4795241701608881, + "grad_norm": 1.3537458462825016, + "learning_rate": 2.9023900029371e-07, + "loss": 1.5888, + "step": 12980 + }, + { + "epoch": 0.4802630363706892, + "grad_norm": 1.585460577335508, + "learning_rate": 2.8971787559753695e-07, + "loss": 1.6476, + "step": 13000 + }, + { + "epoch": 0.48100190258049025, + "grad_norm": 1.561928549919643, + "learning_rate": 2.891966715978679e-07, + "loss": 1.6339, + "step": 13020 + }, + { + "epoch": 0.4817407687902913, + "grad_norm": 1.439464952580829, + "learning_rate": 2.886753911030767e-07, + "loss": 1.6619, + "step": 13040 + }, + { + "epoch": 0.48247963500009233, + "grad_norm": 1.5693967956885457, + "learning_rate": 2.8815403692194954e-07, + "loss": 1.6443, + "step": 13060 + }, + { + "epoch": 0.4832185012098934, + "grad_norm": 1.8445144793183739, + "learning_rate": 2.8763261186366977e-07, + "loss": 1.6395, + "step": 13080 + }, + { + "epoch": 0.4839573674196945, + "grad_norm": 1.4215590880054088, + "learning_rate": 2.8711111873780224e-07, + "loss": 1.6583, + "step": 13100 + }, + { + "epoch": 0.48469623362949554, + "grad_norm": 1.6129407222161285, + "learning_rate": 2.8658956035427917e-07, + "loss": 1.6579, + "step": 13120 + }, + { + "epoch": 0.4854350998392966, + "grad_norm": 1.7787904262576621, + "learning_rate": 2.8606793952338394e-07, + "loss": 1.6387, + "step": 13140 + }, + { + "epoch": 0.4861739660490977, + "grad_norm": 2.9317837538381384, + "learning_rate": 2.8554625905573646e-07, + "loss": 1.6258, + "step": 13160 + }, + { + "epoch": 0.4869128322588987, + "grad_norm": 1.6449106895888608, + "learning_rate": 2.850245217622784e-07, + "loss": 1.6492, + "step": 13180 + }, + { + "epoch": 0.48765169846869977, + "grad_norm": 1.5321621721627146, + "learning_rate": 2.8450273045425677e-07, + "loss": 1.6456, + "step": 13200 + }, + { + "epoch": 0.48839056467850084, + "grad_norm": 1.5327848701302575, + "learning_rate": 2.8398088794321054e-07, + "loss": 1.6299, + "step": 13220 + }, + { + "epoch": 0.4891294308883019, + "grad_norm": 1.5262317315528862, + "learning_rate": 2.8345899704095424e-07, + "loss": 1.6815, + "step": 13240 + }, + { + "epoch": 0.489868297098103, + "grad_norm": 8.056093277940944, + "learning_rate": 2.8293706055956266e-07, + "loss": 1.6196, + "step": 13260 + }, + { + "epoch": 0.49060716330790405, + "grad_norm": 1.7903474479157373, + "learning_rate": 2.8241508131135704e-07, + "loss": 1.6748, + "step": 13280 + }, + { + "epoch": 0.49134602951770506, + "grad_norm": 2.3280755640085857, + "learning_rate": 2.818930621088883e-07, + "loss": 1.674, + "step": 13300 + }, + { + "epoch": 0.49208489572750613, + "grad_norm": 1.7132266058410768, + "learning_rate": 2.8137100576492324e-07, + "loss": 1.6407, + "step": 13320 + }, + { + "epoch": 0.4928237619373072, + "grad_norm": 1.652779406776925, + "learning_rate": 2.808489150924283e-07, + "loss": 1.6672, + "step": 13340 + }, + { + "epoch": 0.4935626281471083, + "grad_norm": 1.597072673714322, + "learning_rate": 2.8032679290455525e-07, + "loss": 1.6326, + "step": 13360 + }, + { + "epoch": 0.49430149435690934, + "grad_norm": 1.483890002284729, + "learning_rate": 2.798046420146254e-07, + "loss": 1.6953, + "step": 13380 + }, + { + "epoch": 0.49504036056671036, + "grad_norm": 1.5673926854706393, + "learning_rate": 2.792824652361149e-07, + "loss": 1.6348, + "step": 13400 + }, + { + "epoch": 0.49577922677651143, + "grad_norm": 1.3752789014048936, + "learning_rate": 2.7876026538263935e-07, + "loss": 1.6333, + "step": 13420 + }, + { + "epoch": 0.4965180929863125, + "grad_norm": 1.439519752453901, + "learning_rate": 2.7823804526793863e-07, + "loss": 1.6322, + "step": 13440 + }, + { + "epoch": 0.49725695919611357, + "grad_norm": 1.6858659909371638, + "learning_rate": 2.777158077058619e-07, + "loss": 1.6087, + "step": 13460 + }, + { + "epoch": 0.49799582540591464, + "grad_norm": 1.475020677300443, + "learning_rate": 2.771935555103521e-07, + "loss": 1.6085, + "step": 13480 + }, + { + "epoch": 0.4987346916157157, + "grad_norm": 1.5498271971579036, + "learning_rate": 2.766712914954314e-07, + "loss": 1.6546, + "step": 13500 + }, + { + "epoch": 0.4994735578255167, + "grad_norm": 2.096090843883931, + "learning_rate": 2.7614901847518525e-07, + "loss": 1.6812, + "step": 13520 + }, + { + "epoch": 0.5002124240353178, + "grad_norm": 1.4457832913454574, + "learning_rate": 2.756267392637479e-07, + "loss": 1.6581, + "step": 13540 + }, + { + "epoch": 0.5009512902451189, + "grad_norm": 2.01817520318154, + "learning_rate": 2.751044566752869e-07, + "loss": 1.6615, + "step": 13560 + }, + { + "epoch": 0.5016901564549199, + "grad_norm": 1.4227402127659055, + "learning_rate": 2.745821735239878e-07, + "loss": 1.6324, + "step": 13580 + }, + { + "epoch": 0.502429022664721, + "grad_norm": 1.8405513240063371, + "learning_rate": 2.7405989262403955e-07, + "loss": 1.6698, + "step": 13600 + }, + { + "epoch": 0.503167888874522, + "grad_norm": 1.4788179775173926, + "learning_rate": 2.7353761678961865e-07, + "loss": 1.6359, + "step": 13620 + }, + { + "epoch": 0.5039067550843231, + "grad_norm": 1.7223731354636942, + "learning_rate": 2.730153488348744e-07, + "loss": 1.6306, + "step": 13640 + }, + { + "epoch": 0.5046456212941242, + "grad_norm": 2.5321925077821406, + "learning_rate": 2.724930915739137e-07, + "loss": 1.6752, + "step": 13660 + }, + { + "epoch": 0.5053844875039252, + "grad_norm": 1.5208216957527443, + "learning_rate": 2.7197084782078585e-07, + "loss": 1.6439, + "step": 13680 + }, + { + "epoch": 0.5061233537137263, + "grad_norm": 1.4323741561095633, + "learning_rate": 2.7144862038946716e-07, + "loss": 1.644, + "step": 13700 + }, + { + "epoch": 0.5068622199235273, + "grad_norm": 1.426194444263622, + "learning_rate": 2.709264120938464e-07, + "loss": 1.6383, + "step": 13720 + }, + { + "epoch": 0.5076010861333284, + "grad_norm": 1.9190094996790648, + "learning_rate": 2.7040422574770866e-07, + "loss": 1.6015, + "step": 13740 + }, + { + "epoch": 0.5083399523431295, + "grad_norm": 1.5070566631142777, + "learning_rate": 2.698820641647212e-07, + "loss": 1.6841, + "step": 13760 + }, + { + "epoch": 0.5090788185529306, + "grad_norm": 1.9970969408548236, + "learning_rate": 2.693599301584179e-07, + "loss": 1.6346, + "step": 13780 + }, + { + "epoch": 0.5098176847627316, + "grad_norm": 1.683784538174349, + "learning_rate": 2.688378265421837e-07, + "loss": 1.6829, + "step": 13800 + }, + { + "epoch": 0.5105565509725326, + "grad_norm": 1.7421711729558282, + "learning_rate": 2.683157561292399e-07, + "loss": 1.626, + "step": 13820 + }, + { + "epoch": 0.5112954171823337, + "grad_norm": 1.6638975974760875, + "learning_rate": 2.6779372173262917e-07, + "loss": 1.6847, + "step": 13840 + }, + { + "epoch": 0.5120342833921347, + "grad_norm": 1.7300243765637946, + "learning_rate": 2.672717261651998e-07, + "loss": 1.6635, + "step": 13860 + }, + { + "epoch": 0.5127731496019359, + "grad_norm": 1.7350443481000342, + "learning_rate": 2.667497722395909e-07, + "loss": 1.6648, + "step": 13880 + }, + { + "epoch": 0.5135120158117369, + "grad_norm": 1.8257677624748465, + "learning_rate": 2.662278627682172e-07, + "loss": 1.642, + "step": 13900 + }, + { + "epoch": 0.5142508820215379, + "grad_norm": 1.7828372493231617, + "learning_rate": 2.657060005632543e-07, + "loss": 1.6354, + "step": 13920 + }, + { + "epoch": 0.514989748231339, + "grad_norm": 1.4463498826235905, + "learning_rate": 2.6518418843662256e-07, + "loss": 1.6342, + "step": 13940 + }, + { + "epoch": 0.51572861444114, + "grad_norm": 1.5876083742799603, + "learning_rate": 2.6466242919997263e-07, + "loss": 1.6541, + "step": 13960 + }, + { + "epoch": 0.5164674806509412, + "grad_norm": 1.4658443332943762, + "learning_rate": 2.641407256646705e-07, + "loss": 1.6865, + "step": 13980 + }, + { + "epoch": 0.5172063468607422, + "grad_norm": 1.3991873689568013, + "learning_rate": 2.636190806417817e-07, + "loss": 1.6322, + "step": 14000 + }, + { + "epoch": 0.5179452130705432, + "grad_norm": 2.1443694620412823, + "learning_rate": 2.6309749694205643e-07, + "loss": 1.6337, + "step": 14020 + }, + { + "epoch": 0.5186840792803443, + "grad_norm": 1.8812922050974208, + "learning_rate": 2.6257597737591484e-07, + "loss": 1.6003, + "step": 14040 + }, + { + "epoch": 0.5194229454901453, + "grad_norm": 1.4849904179267404, + "learning_rate": 2.6205452475343135e-07, + "loss": 1.6554, + "step": 14060 + }, + { + "epoch": 0.5201618116999465, + "grad_norm": 1.5710794059095268, + "learning_rate": 2.6153314188431934e-07, + "loss": 1.6585, + "step": 14080 + }, + { + "epoch": 0.5209006779097475, + "grad_norm": 1.4300979250373247, + "learning_rate": 2.6101183157791687e-07, + "loss": 1.6266, + "step": 14100 + }, + { + "epoch": 0.5216395441195486, + "grad_norm": 1.4201641845366786, + "learning_rate": 2.604905966431707e-07, + "loss": 1.6278, + "step": 14120 + }, + { + "epoch": 0.5223784103293496, + "grad_norm": 1.4634294685934828, + "learning_rate": 2.5996943988862136e-07, + "loss": 1.6575, + "step": 14140 + }, + { + "epoch": 0.5231172765391506, + "grad_norm": 1.5428372121996694, + "learning_rate": 2.594483641223885e-07, + "loss": 1.6751, + "step": 14160 + }, + { + "epoch": 0.5238561427489518, + "grad_norm": 1.738164845435304, + "learning_rate": 2.5892737215215507e-07, + "loss": 1.6492, + "step": 14180 + }, + { + "epoch": 0.5245950089587528, + "grad_norm": 1.5256411770058975, + "learning_rate": 2.584064667851527e-07, + "loss": 1.6491, + "step": 14200 + }, + { + "epoch": 0.5253338751685539, + "grad_norm": 2.0408240630415513, + "learning_rate": 2.578856508281461e-07, + "loss": 1.6424, + "step": 14220 + }, + { + "epoch": 0.5260727413783549, + "grad_norm": 1.5107852579348091, + "learning_rate": 2.573649270874187e-07, + "loss": 1.6575, + "step": 14240 + }, + { + "epoch": 0.5268116075881559, + "grad_norm": 1.606923866961281, + "learning_rate": 2.568442983687567e-07, + "loss": 1.6678, + "step": 14260 + }, + { + "epoch": 0.527550473797957, + "grad_norm": 1.86036331527246, + "learning_rate": 2.5632376747743416e-07, + "loss": 1.6611, + "step": 14280 + }, + { + "epoch": 0.5282893400077581, + "grad_norm": 1.6282520348397496, + "learning_rate": 2.5580333721819837e-07, + "loss": 1.6887, + "step": 14300 + }, + { + "epoch": 0.5290282062175592, + "grad_norm": 1.4902965967534727, + "learning_rate": 2.5528301039525427e-07, + "loss": 1.673, + "step": 14320 + }, + { + "epoch": 0.5297670724273602, + "grad_norm": 2.9289521410401607, + "learning_rate": 2.547627898122493e-07, + "loss": 1.618, + "step": 14340 + }, + { + "epoch": 0.5305059386371612, + "grad_norm": 1.5801255890460382, + "learning_rate": 2.5424267827225884e-07, + "loss": 1.6478, + "step": 14360 + }, + { + "epoch": 0.5312448048469623, + "grad_norm": 1.904222753922445, + "learning_rate": 2.5372267857777017e-07, + "loss": 1.6543, + "step": 14380 + }, + { + "epoch": 0.5319836710567634, + "grad_norm": 1.5136725876022765, + "learning_rate": 2.532027935306684e-07, + "loss": 1.658, + "step": 14400 + }, + { + "epoch": 0.5327225372665645, + "grad_norm": 1.8648484080963088, + "learning_rate": 2.5268302593222056e-07, + "loss": 1.6279, + "step": 14420 + }, + { + "epoch": 0.5334614034763655, + "grad_norm": 1.4732933175166334, + "learning_rate": 2.521633785830612e-07, + "loss": 1.6535, + "step": 14440 + }, + { + "epoch": 0.5342002696861666, + "grad_norm": 1.7964137810644547, + "learning_rate": 2.5164385428317656e-07, + "loss": 1.6291, + "step": 14460 + }, + { + "epoch": 0.5349391358959676, + "grad_norm": 1.7384258178088878, + "learning_rate": 2.5112445583189e-07, + "loss": 1.6484, + "step": 14480 + }, + { + "epoch": 0.5356780021057687, + "grad_norm": 1.6118844731600752, + "learning_rate": 2.506051860278469e-07, + "loss": 1.6461, + "step": 14500 + }, + { + "epoch": 0.5364168683155698, + "grad_norm": 1.612441861147252, + "learning_rate": 2.500860476689993e-07, + "loss": 1.6368, + "step": 14520 + }, + { + "epoch": 0.5371557345253708, + "grad_norm": 1.4719276982885592, + "learning_rate": 2.4956704355259106e-07, + "loss": 1.616, + "step": 14540 + }, + { + "epoch": 0.5378946007351719, + "grad_norm": 1.4849285106056183, + "learning_rate": 2.4904817647514273e-07, + "loss": 1.6467, + "step": 14560 + }, + { + "epoch": 0.5386334669449729, + "grad_norm": 2.0929018106610533, + "learning_rate": 2.485294492324364e-07, + "loss": 1.6517, + "step": 14580 + }, + { + "epoch": 0.539372333154774, + "grad_norm": 1.3910097740422103, + "learning_rate": 2.480108646195006e-07, + "loss": 1.6319, + "step": 14600 + }, + { + "epoch": 0.5401111993645751, + "grad_norm": 1.8158803135234147, + "learning_rate": 2.474924254305956e-07, + "loss": 1.6902, + "step": 14620 + }, + { + "epoch": 0.5408500655743761, + "grad_norm": 1.6514040636762424, + "learning_rate": 2.4697413445919785e-07, + "loss": 1.6479, + "step": 14640 + }, + { + "epoch": 0.5415889317841772, + "grad_norm": 1.5739603939688216, + "learning_rate": 2.4645599449798536e-07, + "loss": 1.639, + "step": 14660 + }, + { + "epoch": 0.5423277979939782, + "grad_norm": 1.5178753830207266, + "learning_rate": 2.459380083388221e-07, + "loss": 1.6235, + "step": 14680 + }, + { + "epoch": 0.5430666642037792, + "grad_norm": 1.52558838171546, + "learning_rate": 2.4542017877274397e-07, + "loss": 1.6835, + "step": 14700 + }, + { + "epoch": 0.5438055304135804, + "grad_norm": 2.2408509501139533, + "learning_rate": 2.4490250858994243e-07, + "loss": 1.5869, + "step": 14720 + }, + { + "epoch": 0.5445443966233814, + "grad_norm": 1.6053244248684069, + "learning_rate": 2.4438500057975043e-07, + "loss": 1.6698, + "step": 14740 + }, + { + "epoch": 0.5452832628331825, + "grad_norm": 1.4975811830975623, + "learning_rate": 2.4386765753062733e-07, + "loss": 1.6337, + "step": 14760 + }, + { + "epoch": 0.5460221290429835, + "grad_norm": 1.4849817547603397, + "learning_rate": 2.4335048223014316e-07, + "loss": 1.6095, + "step": 14780 + }, + { + "epoch": 0.5467609952527847, + "grad_norm": 1.8454272427613772, + "learning_rate": 2.4283347746496436e-07, + "loss": 1.6191, + "step": 14800 + }, + { + "epoch": 0.5474998614625857, + "grad_norm": 1.484721990845683, + "learning_rate": 2.4231664602083857e-07, + "loss": 1.6156, + "step": 14820 + }, + { + "epoch": 0.5482387276723867, + "grad_norm": 1.4970531164331227, + "learning_rate": 2.4179999068257935e-07, + "loss": 1.6903, + "step": 14840 + }, + { + "epoch": 0.5489775938821878, + "grad_norm": 1.60919652354879, + "learning_rate": 2.412835142340513e-07, + "loss": 1.6813, + "step": 14860 + }, + { + "epoch": 0.5497164600919888, + "grad_norm": 1.3606018353206684, + "learning_rate": 2.4076721945815544e-07, + "loss": 1.6769, + "step": 14880 + }, + { + "epoch": 0.55045532630179, + "grad_norm": 1.458693168765768, + "learning_rate": 2.4025110913681355e-07, + "loss": 1.6373, + "step": 14900 + }, + { + "epoch": 0.551194192511591, + "grad_norm": 1.547291419668359, + "learning_rate": 2.397351860509537e-07, + "loss": 1.6525, + "step": 14920 + }, + { + "epoch": 0.551933058721392, + "grad_norm": 1.7224542921095407, + "learning_rate": 2.392194529804951e-07, + "loss": 1.6761, + "step": 14940 + }, + { + "epoch": 0.5526719249311931, + "grad_norm": 1.6677249547234672, + "learning_rate": 2.38703912704333e-07, + "loss": 1.625, + "step": 14960 + }, + { + "epoch": 0.5534107911409941, + "grad_norm": 1.4519952098563818, + "learning_rate": 2.3818856800032395e-07, + "loss": 1.6244, + "step": 14980 + }, + { + "epoch": 0.5541496573507952, + "grad_norm": 1.7967122495859562, + "learning_rate": 2.3767342164527055e-07, + "loss": 1.6719, + "step": 15000 + }, + { + "epoch": 0.5548885235605963, + "grad_norm": 1.3751693238795433, + "learning_rate": 2.3715847641490688e-07, + "loss": 1.6397, + "step": 15020 + }, + { + "epoch": 0.5556273897703973, + "grad_norm": 1.5461207825297583, + "learning_rate": 2.3664373508388318e-07, + "loss": 1.6871, + "step": 15040 + }, + { + "epoch": 0.5563662559801984, + "grad_norm": 1.3729095610665938, + "learning_rate": 2.3612920042575091e-07, + "loss": 1.6568, + "step": 15060 + }, + { + "epoch": 0.5571051221899994, + "grad_norm": 1.5955595428086877, + "learning_rate": 2.3561487521294814e-07, + "loss": 1.6439, + "step": 15080 + }, + { + "epoch": 0.5578439883998005, + "grad_norm": 1.505255489966295, + "learning_rate": 2.351007622167843e-07, + "loss": 1.6114, + "step": 15100 + }, + { + "epoch": 0.5585828546096016, + "grad_norm": 1.4629681148522744, + "learning_rate": 2.3458686420742528e-07, + "loss": 1.6114, + "step": 15120 + }, + { + "epoch": 0.5593217208194027, + "grad_norm": 1.7359961722060924, + "learning_rate": 2.3407318395387875e-07, + "loss": 1.6416, + "step": 15140 + }, + { + "epoch": 0.5600605870292037, + "grad_norm": 1.6390324621472498, + "learning_rate": 2.3355972422397895e-07, + "loss": 1.6625, + "step": 15160 + }, + { + "epoch": 0.5607994532390047, + "grad_norm": 1.7925619507510513, + "learning_rate": 2.3304648778437175e-07, + "loss": 1.6822, + "step": 15180 + }, + { + "epoch": 0.5615383194488058, + "grad_norm": 1.6256712121515025, + "learning_rate": 2.3253347740050012e-07, + "loss": 1.6793, + "step": 15200 + }, + { + "epoch": 0.5622771856586068, + "grad_norm": 1.6887168187109596, + "learning_rate": 2.3202069583658883e-07, + "loss": 1.6403, + "step": 15220 + }, + { + "epoch": 0.563016051868408, + "grad_norm": 1.4622893380793243, + "learning_rate": 2.3150814585562984e-07, + "loss": 1.6256, + "step": 15240 + }, + { + "epoch": 0.563754918078209, + "grad_norm": 1.720681049824639, + "learning_rate": 2.3099583021936703e-07, + "loss": 1.6331, + "step": 15260 + }, + { + "epoch": 0.56449378428801, + "grad_norm": 1.6844323896773028, + "learning_rate": 2.3048375168828194e-07, + "loss": 1.6249, + "step": 15280 + }, + { + "epoch": 0.5652326504978111, + "grad_norm": 1.4304416297000766, + "learning_rate": 2.2997191302157831e-07, + "loss": 1.6476, + "step": 15300 + }, + { + "epoch": 0.5659715167076121, + "grad_norm": 2.6747036703519966, + "learning_rate": 2.2946031697716728e-07, + "loss": 1.6704, + "step": 15320 + }, + { + "epoch": 0.5667103829174133, + "grad_norm": 1.8934913018327109, + "learning_rate": 2.2894896631165312e-07, + "loss": 1.6557, + "step": 15340 + }, + { + "epoch": 0.5674492491272143, + "grad_norm": 1.5864443521535418, + "learning_rate": 2.2843786378031749e-07, + "loss": 1.6111, + "step": 15360 + }, + { + "epoch": 0.5681881153370153, + "grad_norm": 1.6147764207744268, + "learning_rate": 2.279270121371053e-07, + "loss": 1.6617, + "step": 15380 + }, + { + "epoch": 0.5689269815468164, + "grad_norm": 1.5889401903281988, + "learning_rate": 2.274164141346096e-07, + "loss": 1.6472, + "step": 15400 + }, + { + "epoch": 0.5696658477566174, + "grad_norm": 1.8322046948313095, + "learning_rate": 2.2690607252405664e-07, + "loss": 1.681, + "step": 15420 + }, + { + "epoch": 0.5704047139664186, + "grad_norm": 1.319095874026253, + "learning_rate": 2.2639599005529124e-07, + "loss": 1.6339, + "step": 15440 + }, + { + "epoch": 0.5711435801762196, + "grad_norm": 1.568413450074265, + "learning_rate": 2.258861694767619e-07, + "loss": 1.6385, + "step": 15460 + }, + { + "epoch": 0.5718824463860207, + "grad_norm": 1.659163649600049, + "learning_rate": 2.2537661353550603e-07, + "loss": 1.6292, + "step": 15480 + }, + { + "epoch": 0.5726213125958217, + "grad_norm": 1.484851792665619, + "learning_rate": 2.2486732497713507e-07, + "loss": 1.6887, + "step": 15500 + }, + { + "epoch": 0.5733601788056227, + "grad_norm": 1.609907878598695, + "learning_rate": 2.2435830654581962e-07, + "loss": 1.6266, + "step": 15520 + }, + { + "epoch": 0.5740990450154239, + "grad_norm": 1.4453575034227937, + "learning_rate": 2.2387499173937125e-07, + "loss": 1.6537, + "step": 15540 + }, + { + "epoch": 0.5748379112252249, + "grad_norm": 1.7710876217433056, + "learning_rate": 2.2336650794320994e-07, + "loss": 1.6588, + "step": 15560 + }, + { + "epoch": 0.575576777435026, + "grad_norm": 1.4085011499137292, + "learning_rate": 2.2285830236087167e-07, + "loss": 1.6293, + "step": 15580 + }, + { + "epoch": 0.576315643644827, + "grad_norm": 1.4053148152524308, + "learning_rate": 2.2235037773069188e-07, + "loss": 1.629, + "step": 15600 + }, + { + "epoch": 0.577054509854628, + "grad_norm": 1.456136317052379, + "learning_rate": 2.2184273678949212e-07, + "loss": 1.6448, + "step": 15620 + }, + { + "epoch": 0.5777933760644292, + "grad_norm": 1.5709035364905237, + "learning_rate": 2.213353822725652e-07, + "loss": 1.6556, + "step": 15640 + }, + { + "epoch": 0.5785322422742302, + "grad_norm": 2.381482655936729, + "learning_rate": 2.2082831691366104e-07, + "loss": 1.6298, + "step": 15660 + }, + { + "epoch": 0.5792711084840313, + "grad_norm": 1.510088899026219, + "learning_rate": 2.2032154344497096e-07, + "loss": 1.69, + "step": 15680 + }, + { + "epoch": 0.5800099746938323, + "grad_norm": 1.4208293328335637, + "learning_rate": 2.198150645971138e-07, + "loss": 1.6533, + "step": 15700 + }, + { + "epoch": 0.5807488409036333, + "grad_norm": 1.5394108559637645, + "learning_rate": 2.1930888309912098e-07, + "loss": 1.6145, + "step": 15720 + }, + { + "epoch": 0.5814877071134344, + "grad_norm": 1.8494498268185677, + "learning_rate": 2.188030016784216e-07, + "loss": 1.6262, + "step": 15740 + }, + { + "epoch": 0.5822265733232355, + "grad_norm": 2.390942191221342, + "learning_rate": 2.1829742306082778e-07, + "loss": 1.612, + "step": 15760 + }, + { + "epoch": 0.5829654395330366, + "grad_norm": 2.4364332149226446, + "learning_rate": 2.1779214997052025e-07, + "loss": 1.6548, + "step": 15780 + }, + { + "epoch": 0.5837043057428376, + "grad_norm": 1.7161768355514782, + "learning_rate": 2.1728718513003342e-07, + "loss": 1.6822, + "step": 15800 + }, + { + "epoch": 0.5844431719526387, + "grad_norm": 1.6209379371159418, + "learning_rate": 2.1678253126024072e-07, + "loss": 1.6068, + "step": 15820 + }, + { + "epoch": 0.5851820381624397, + "grad_norm": 2.1623351366291725, + "learning_rate": 2.1627819108034002e-07, + "loss": 1.6138, + "step": 15840 + }, + { + "epoch": 0.5859209043722408, + "grad_norm": 1.3848518910214123, + "learning_rate": 2.1577416730783904e-07, + "loss": 1.6315, + "step": 15860 + }, + { + "epoch": 0.5866597705820419, + "grad_norm": 1.377598599479366, + "learning_rate": 2.1527046265854049e-07, + "loss": 1.6263, + "step": 15880 + }, + { + "epoch": 0.5873986367918429, + "grad_norm": 1.5951258889353628, + "learning_rate": 2.1476707984652764e-07, + "loss": 1.6442, + "step": 15900 + }, + { + "epoch": 0.588137503001644, + "grad_norm": 1.4119428291190372, + "learning_rate": 2.1426402158414964e-07, + "loss": 1.6776, + "step": 15920 + }, + { + "epoch": 0.588876369211445, + "grad_norm": 1.5401792838637114, + "learning_rate": 2.1376129058200687e-07, + "loss": 1.6489, + "step": 15940 + }, + { + "epoch": 0.589615235421246, + "grad_norm": 1.603780373356476, + "learning_rate": 2.1325888954893618e-07, + "loss": 1.6525, + "step": 15960 + }, + { + "epoch": 0.5903541016310472, + "grad_norm": 1.5200619012123444, + "learning_rate": 2.1275682119199674e-07, + "loss": 1.6103, + "step": 15980 + }, + { + "epoch": 0.5910929678408482, + "grad_norm": 2.1303907208230637, + "learning_rate": 2.122550882164552e-07, + "loss": 1.6515, + "step": 16000 + }, + { + "epoch": 0.5918318340506493, + "grad_norm": 1.4309458414094776, + "learning_rate": 2.1175369332577075e-07, + "loss": 1.6476, + "step": 16020 + }, + { + "epoch": 0.5925707002604503, + "grad_norm": 1.3885096209200305, + "learning_rate": 2.112526392215811e-07, + "loss": 1.6161, + "step": 16040 + }, + { + "epoch": 0.5933095664702513, + "grad_norm": 1.4639170589501997, + "learning_rate": 2.107519286036879e-07, + "loss": 1.6626, + "step": 16060 + }, + { + "epoch": 0.5940484326800525, + "grad_norm": 1.5413296048888148, + "learning_rate": 2.102515641700417e-07, + "loss": 1.7111, + "step": 16080 + }, + { + "epoch": 0.5947872988898535, + "grad_norm": 1.477261253181655, + "learning_rate": 2.0975154861672782e-07, + "loss": 1.6606, + "step": 16100 + }, + { + "epoch": 0.5955261650996546, + "grad_norm": 1.484117052461405, + "learning_rate": 2.0925188463795195e-07, + "loss": 1.6587, + "step": 16120 + }, + { + "epoch": 0.5962650313094556, + "grad_norm": 1.492261770923395, + "learning_rate": 2.0875257492602505e-07, + "loss": 1.629, + "step": 16140 + }, + { + "epoch": 0.5970038975192568, + "grad_norm": 1.4469424063226348, + "learning_rate": 2.082536221713494e-07, + "loss": 1.6496, + "step": 16160 + }, + { + "epoch": 0.5977427637290578, + "grad_norm": 1.6092362505845061, + "learning_rate": 2.07755029062404e-07, + "loss": 1.6664, + "step": 16180 + }, + { + "epoch": 0.5984816299388588, + "grad_norm": 1.779958420465131, + "learning_rate": 2.0725679828572983e-07, + "loss": 1.6212, + "step": 16200 + }, + { + "epoch": 0.5992204961486599, + "grad_norm": 2.256981377181274, + "learning_rate": 2.0675893252591558e-07, + "loss": 1.6603, + "step": 16220 + }, + { + "epoch": 0.5999593623584609, + "grad_norm": 1.4438145967369689, + "learning_rate": 2.0626143446558313e-07, + "loss": 1.7086, + "step": 16240 + }, + { + "epoch": 0.600698228568262, + "grad_norm": 1.4523681015745287, + "learning_rate": 2.0576430678537314e-07, + "loss": 1.6363, + "step": 16260 + }, + { + "epoch": 0.6014370947780631, + "grad_norm": 2.081965836536827, + "learning_rate": 2.052675521639306e-07, + "loss": 1.6525, + "step": 16280 + }, + { + "epoch": 0.6021759609878641, + "grad_norm": 1.641105539346371, + "learning_rate": 2.0477117327789017e-07, + "loss": 1.7219, + "step": 16300 + }, + { + "epoch": 0.6029148271976652, + "grad_norm": 2.1960028742429887, + "learning_rate": 2.0427517280186225e-07, + "loss": 1.7079, + "step": 16320 + }, + { + "epoch": 0.6036536934074662, + "grad_norm": 1.421358868551972, + "learning_rate": 2.0377955340841817e-07, + "loss": 1.6494, + "step": 16340 + }, + { + "epoch": 0.6043925596172673, + "grad_norm": 1.4519180712299584, + "learning_rate": 2.032843177680757e-07, + "loss": 1.6497, + "step": 16360 + }, + { + "epoch": 0.6051314258270684, + "grad_norm": 1.4554186364319244, + "learning_rate": 2.0278946854928512e-07, + "loss": 1.6623, + "step": 16380 + }, + { + "epoch": 0.6058702920368694, + "grad_norm": 1.453630709571824, + "learning_rate": 2.022950084184145e-07, + "loss": 1.6481, + "step": 16400 + }, + { + "epoch": 0.6066091582466705, + "grad_norm": 1.504491667770329, + "learning_rate": 2.018009400397353e-07, + "loss": 1.677, + "step": 16420 + }, + { + "epoch": 0.6073480244564715, + "grad_norm": 1.388924417705384, + "learning_rate": 2.0130726607540828e-07, + "loss": 1.6496, + "step": 16440 + }, + { + "epoch": 0.6080868906662726, + "grad_norm": 1.464940095501643, + "learning_rate": 2.0081398918546882e-07, + "loss": 1.6999, + "step": 16460 + }, + { + "epoch": 0.6088257568760737, + "grad_norm": 1.7055463049168984, + "learning_rate": 2.0032111202781282e-07, + "loss": 1.6249, + "step": 16480 + }, + { + "epoch": 0.6095646230858748, + "grad_norm": 1.6279220224411552, + "learning_rate": 1.9982863725818267e-07, + "loss": 1.6285, + "step": 16500 + }, + { + "epoch": 0.6103034892956758, + "grad_norm": 2.0351245502127404, + "learning_rate": 1.9933656753015204e-07, + "loss": 1.6595, + "step": 16520 + }, + { + "epoch": 0.6110423555054768, + "grad_norm": 2.018723900559302, + "learning_rate": 1.9884490549511252e-07, + "loss": 1.7325, + "step": 16540 + }, + { + "epoch": 0.6117812217152779, + "grad_norm": 1.4930972850593807, + "learning_rate": 1.983782066004026e-07, + "loss": 1.6739, + "step": 16560 + }, + { + "epoch": 0.612520087925079, + "grad_norm": 1.6719536221986355, + "learning_rate": 1.9788734718442834e-07, + "loss": 1.6453, + "step": 16580 + }, + { + "epoch": 0.6132589541348801, + "grad_norm": 1.5901664783269642, + "learning_rate": 1.9739690327019692e-07, + "loss": 1.6688, + "step": 16600 + }, + { + "epoch": 0.6139978203446811, + "grad_norm": 1.5005389488409309, + "learning_rate": 1.9693136881713379e-07, + "loss": 1.6697, + "step": 16620 + }, + { + "epoch": 0.6147366865544821, + "grad_norm": 1.5857034959363703, + "learning_rate": 1.9644174273011738e-07, + "loss": 1.6639, + "step": 16640 + }, + { + "epoch": 0.6154755527642832, + "grad_norm": 1.8800052700521002, + "learning_rate": 1.959525399341126e-07, + "loss": 1.6406, + "step": 16660 + }, + { + "epoch": 0.6162144189740842, + "grad_norm": 1.5463318718925796, + "learning_rate": 1.954637630650633e-07, + "loss": 1.6456, + "step": 16680 + }, + { + "epoch": 0.6169532851838854, + "grad_norm": 1.7265411721417883, + "learning_rate": 1.9497541475661822e-07, + "loss": 1.6396, + "step": 16700 + }, + { + "epoch": 0.6176921513936864, + "grad_norm": 1.6019332231293413, + "learning_rate": 1.9448749764011674e-07, + "loss": 1.6319, + "step": 16720 + }, + { + "epoch": 0.6184310176034874, + "grad_norm": 1.6078339500202126, + "learning_rate": 1.940000143445753e-07, + "loss": 1.6287, + "step": 16740 + }, + { + "epoch": 0.6191698838132885, + "grad_norm": 1.5200063311449286, + "learning_rate": 1.9351296749667239e-07, + "loss": 1.6556, + "step": 16760 + }, + { + "epoch": 0.6199087500230895, + "grad_norm": 1.5605900758303721, + "learning_rate": 1.9302635972073504e-07, + "loss": 1.6709, + "step": 16780 + }, + { + "epoch": 0.6206476162328907, + "grad_norm": 1.5245501861602075, + "learning_rate": 1.9254019363872432e-07, + "loss": 1.6744, + "step": 16800 + }, + { + "epoch": 0.6213864824426917, + "grad_norm": 1.4527294863239084, + "learning_rate": 1.9205447187022145e-07, + "loss": 1.6564, + "step": 16820 + }, + { + "epoch": 0.6221253486524928, + "grad_norm": 2.0368137299260276, + "learning_rate": 1.915691970324137e-07, + "loss": 1.6289, + "step": 16840 + }, + { + "epoch": 0.6228642148622938, + "grad_norm": 2.2640348268112147, + "learning_rate": 1.9108437174007967e-07, + "loss": 1.667, + "step": 16860 + }, + { + "epoch": 0.6236030810720948, + "grad_norm": 1.4879411305430876, + "learning_rate": 1.9059999860557635e-07, + "loss": 1.6516, + "step": 16880 + }, + { + "epoch": 0.624341947281896, + "grad_norm": 1.99321589038771, + "learning_rate": 1.9011608023882396e-07, + "loss": 1.6617, + "step": 16900 + }, + { + "epoch": 0.625080813491697, + "grad_norm": 1.4486992732108148, + "learning_rate": 1.8963261924729247e-07, + "loss": 1.6477, + "step": 16920 + }, + { + "epoch": 0.6258196797014981, + "grad_norm": 1.4436779823541692, + "learning_rate": 1.8914961823598742e-07, + "loss": 1.6276, + "step": 16940 + }, + { + "epoch": 0.6265585459112991, + "grad_norm": 1.7823515681610929, + "learning_rate": 1.886670798074358e-07, + "loss": 1.6722, + "step": 16960 + }, + { + "epoch": 0.6272974121211001, + "grad_norm": 1.4559994514082784, + "learning_rate": 1.8818500656167198e-07, + "loss": 1.6721, + "step": 16980 + }, + { + "epoch": 0.6280362783309013, + "grad_norm": 1.5502170823927217, + "learning_rate": 1.8770340109622418e-07, + "loss": 1.6468, + "step": 17000 + }, + { + "epoch": 0.6287751445407023, + "grad_norm": 1.3693032988758314, + "learning_rate": 1.8722226600609974e-07, + "loss": 1.6503, + "step": 17020 + }, + { + "epoch": 0.6295140107505034, + "grad_norm": 1.8228163395950472, + "learning_rate": 1.8674160388377174e-07, + "loss": 1.6691, + "step": 17040 + }, + { + "epoch": 0.6302528769603044, + "grad_norm": 1.607512275964286, + "learning_rate": 1.8626141731916446e-07, + "loss": 1.6381, + "step": 17060 + }, + { + "epoch": 0.6309917431701054, + "grad_norm": 1.6555733853411483, + "learning_rate": 1.8578170889964022e-07, + "loss": 1.624, + "step": 17080 + }, + { + "epoch": 0.6317306093799065, + "grad_norm": 1.4667357369050853, + "learning_rate": 1.853024812099847e-07, + "loss": 1.6233, + "step": 17100 + }, + { + "epoch": 0.6324694755897076, + "grad_norm": 1.555065221242107, + "learning_rate": 1.8482373683239316e-07, + "loss": 1.6372, + "step": 17120 + }, + { + "epoch": 0.6332083417995087, + "grad_norm": 1.5169327799558363, + "learning_rate": 1.8434547834645714e-07, + "loss": 1.6738, + "step": 17140 + }, + { + "epoch": 0.6339472080093097, + "grad_norm": 1.419410682586359, + "learning_rate": 1.8386770832914955e-07, + "loss": 1.6677, + "step": 17160 + }, + { + "epoch": 0.6346860742191108, + "grad_norm": 1.6719841699284368, + "learning_rate": 1.833904293548116e-07, + "loss": 1.6821, + "step": 17180 + }, + { + "epoch": 0.6354249404289118, + "grad_norm": 1.5798183541162123, + "learning_rate": 1.8291364399513864e-07, + "loss": 1.7092, + "step": 17200 + }, + { + "epoch": 0.6361638066387129, + "grad_norm": 1.4604030691233605, + "learning_rate": 1.8243735481916611e-07, + "loss": 1.662, + "step": 17220 + }, + { + "epoch": 0.636902672848514, + "grad_norm": 1.7774575653306484, + "learning_rate": 1.8196156439325604e-07, + "loss": 1.655, + "step": 17240 + }, + { + "epoch": 0.637641539058315, + "grad_norm": 2.062948052538768, + "learning_rate": 1.8148627528108323e-07, + "loss": 1.65, + "step": 17260 + }, + { + "epoch": 0.6383804052681161, + "grad_norm": 1.7560243016328074, + "learning_rate": 1.8101149004362088e-07, + "loss": 1.6068, + "step": 17280 + }, + { + "epoch": 0.6391192714779171, + "grad_norm": 1.589922555292764, + "learning_rate": 1.8053721123912764e-07, + "loss": 1.6432, + "step": 17300 + }, + { + "epoch": 0.6398581376877182, + "grad_norm": 1.7855781248038047, + "learning_rate": 1.8006344142313285e-07, + "loss": 1.6444, + "step": 17320 + }, + { + "epoch": 0.6405970038975193, + "grad_norm": 1.462859488532895, + "learning_rate": 1.7959018314842395e-07, + "loss": 1.6225, + "step": 17340 + }, + { + "epoch": 0.6413358701073203, + "grad_norm": 1.5201929263554286, + "learning_rate": 1.7911743896503144e-07, + "loss": 1.6216, + "step": 17360 + }, + { + "epoch": 0.6420747363171214, + "grad_norm": 1.5039545520824391, + "learning_rate": 1.7864521142021616e-07, + "loss": 1.597, + "step": 17380 + }, + { + "epoch": 0.6428136025269224, + "grad_norm": 2.1198882531068106, + "learning_rate": 1.7817350305845503e-07, + "loss": 1.6762, + "step": 17400 + }, + { + "epoch": 0.6435524687367234, + "grad_norm": 1.5052045132821683, + "learning_rate": 1.7770231642142758e-07, + "loss": 1.6459, + "step": 17420 + }, + { + "epoch": 0.6442913349465246, + "grad_norm": 1.5702310750127326, + "learning_rate": 1.77231654048002e-07, + "loss": 1.5676, + "step": 17440 + }, + { + "epoch": 0.6450302011563256, + "grad_norm": 1.49975631121171, + "learning_rate": 1.7676151847422188e-07, + "loss": 1.6558, + "step": 17460 + }, + { + "epoch": 0.6457690673661267, + "grad_norm": 1.8852376014336283, + "learning_rate": 1.7629191223329188e-07, + "loss": 1.6598, + "step": 17480 + }, + { + "epoch": 0.6465079335759277, + "grad_norm": 1.5809036111526213, + "learning_rate": 1.7582283785556494e-07, + "loss": 1.6148, + "step": 17500 + }, + { + "epoch": 0.6472467997857289, + "grad_norm": 1.4247569077843545, + "learning_rate": 1.75354297868528e-07, + "loss": 1.6318, + "step": 17520 + }, + { + "epoch": 0.6479856659955299, + "grad_norm": 1.6577683592238937, + "learning_rate": 1.748862947967885e-07, + "loss": 1.6551, + "step": 17540 + }, + { + "epoch": 0.6487245322053309, + "grad_norm": 7.300032033927882, + "learning_rate": 1.744188311620608e-07, + "loss": 1.6892, + "step": 17560 + }, + { + "epoch": 0.649463398415132, + "grad_norm": 1.4132601163703873, + "learning_rate": 1.7395190948315282e-07, + "loss": 1.6817, + "step": 17580 + }, + { + "epoch": 0.650202264624933, + "grad_norm": 1.5063433467194194, + "learning_rate": 1.7348553227595218e-07, + "loss": 1.6158, + "step": 17600 + }, + { + "epoch": 0.6509411308347341, + "grad_norm": 1.5169596981657725, + "learning_rate": 1.7301970205341292e-07, + "loss": 1.6779, + "step": 17620 + }, + { + "epoch": 0.6516799970445352, + "grad_norm": 1.6068564294026548, + "learning_rate": 1.725544213255415e-07, + "loss": 1.6179, + "step": 17640 + }, + { + "epoch": 0.6524188632543362, + "grad_norm": 1.401533779590892, + "learning_rate": 1.7208969259938396e-07, + "loss": 1.6992, + "step": 17660 + }, + { + "epoch": 0.6531577294641373, + "grad_norm": 1.7940271180903984, + "learning_rate": 1.7162551837901149e-07, + "loss": 1.6343, + "step": 17680 + }, + { + "epoch": 0.6538965956739383, + "grad_norm": 1.4503762459176361, + "learning_rate": 1.7116190116550798e-07, + "loss": 1.6241, + "step": 17700 + }, + { + "epoch": 0.6546354618837394, + "grad_norm": 1.9129744363614924, + "learning_rate": 1.7069884345695585e-07, + "loss": 1.6242, + "step": 17720 + }, + { + "epoch": 0.6553743280935405, + "grad_norm": 1.4592502547252286, + "learning_rate": 1.7023634774842265e-07, + "loss": 1.6433, + "step": 17740 + }, + { + "epoch": 0.6561131943033415, + "grad_norm": 2.3740218695344026, + "learning_rate": 1.6977441653194778e-07, + "loss": 1.6407, + "step": 17760 + }, + { + "epoch": 0.6568520605131426, + "grad_norm": 1.652867656549423, + "learning_rate": 1.6931305229652911e-07, + "loss": 1.6571, + "step": 17780 + }, + { + "epoch": 0.6575909267229436, + "grad_norm": 1.8510532804043571, + "learning_rate": 1.688522575281096e-07, + "loss": 1.6393, + "step": 17800 + }, + { + "epoch": 0.6583297929327447, + "grad_norm": 1.5330852891820108, + "learning_rate": 1.6839203470956348e-07, + "loss": 1.6181, + "step": 17820 + }, + { + "epoch": 0.6590686591425458, + "grad_norm": 2.179872107638406, + "learning_rate": 1.6793238632068323e-07, + "loss": 1.6467, + "step": 17840 + }, + { + "epoch": 0.6598075253523468, + "grad_norm": 1.5709625450812563, + "learning_rate": 1.6747331483816645e-07, + "loss": 1.6931, + "step": 17860 + }, + { + "epoch": 0.6605463915621479, + "grad_norm": 1.7454282483475967, + "learning_rate": 1.6701482273560185e-07, + "loss": 1.6292, + "step": 17880 + }, + { + "epoch": 0.6612852577719489, + "grad_norm": 1.7594994883208979, + "learning_rate": 1.6655691248345655e-07, + "loss": 1.6171, + "step": 17900 + }, + { + "epoch": 0.66202412398175, + "grad_norm": 1.5140697252908892, + "learning_rate": 1.6609958654906255e-07, + "loss": 1.6319, + "step": 17920 + }, + { + "epoch": 0.662762990191551, + "grad_norm": 2.248352984954327, + "learning_rate": 1.6564284739660316e-07, + "loss": 1.6363, + "step": 17940 + }, + { + "epoch": 0.6635018564013522, + "grad_norm": 2.0596192177611368, + "learning_rate": 1.6518669748710013e-07, + "loss": 1.6264, + "step": 17960 + }, + { + "epoch": 0.6642407226111532, + "grad_norm": 1.4805518708471208, + "learning_rate": 1.647311392784002e-07, + "loss": 1.6559, + "step": 17980 + }, + { + "epoch": 0.6649795888209542, + "grad_norm": 1.5620227618208977, + "learning_rate": 1.6427617522516196e-07, + "loss": 1.6528, + "step": 18000 + }, + { + "epoch": 0.6657184550307553, + "grad_norm": 1.5698059903501222, + "learning_rate": 1.6382180777884236e-07, + "loss": 1.68, + "step": 18020 + }, + { + "epoch": 0.6664573212405563, + "grad_norm": 1.525456023190327, + "learning_rate": 1.6336803938768396e-07, + "loss": 1.6129, + "step": 18040 + }, + { + "epoch": 0.6671961874503575, + "grad_norm": 1.9244616810959143, + "learning_rate": 1.6291487249670116e-07, + "loss": 1.6074, + "step": 18060 + }, + { + "epoch": 0.6679350536601585, + "grad_norm": 1.5470316335951617, + "learning_rate": 1.6246230954766744e-07, + "loss": 1.6174, + "step": 18080 + }, + { + "epoch": 0.6686739198699595, + "grad_norm": 1.460047028189958, + "learning_rate": 1.6201035297910215e-07, + "loss": 1.6387, + "step": 18100 + }, + { + "epoch": 0.6694127860797606, + "grad_norm": 1.849597715575099, + "learning_rate": 1.6155900522625744e-07, + "loss": 1.6357, + "step": 18120 + }, + { + "epoch": 0.6701516522895616, + "grad_norm": 1.595432962229376, + "learning_rate": 1.6110826872110478e-07, + "loss": 1.6175, + "step": 18140 + }, + { + "epoch": 0.6708905184993628, + "grad_norm": 1.5318757576478021, + "learning_rate": 1.6065814589232206e-07, + "loss": 1.6235, + "step": 18160 + }, + { + "epoch": 0.6716293847091638, + "grad_norm": 1.4152502346247018, + "learning_rate": 1.602086391652807e-07, + "loss": 1.6287, + "step": 18180 + }, + { + "epoch": 0.6723682509189648, + "grad_norm": 1.730605954821045, + "learning_rate": 1.5975975096203248e-07, + "loss": 1.6297, + "step": 18200 + }, + { + "epoch": 0.6731071171287659, + "grad_norm": 1.641811600664541, + "learning_rate": 1.5931148370129613e-07, + "loss": 1.6575, + "step": 18220 + }, + { + "epoch": 0.6738459833385669, + "grad_norm": 1.4446876896322507, + "learning_rate": 1.5886383979844492e-07, + "loss": 1.6488, + "step": 18240 + }, + { + "epoch": 0.6745848495483681, + "grad_norm": 1.6489416268912538, + "learning_rate": 1.5841682166549308e-07, + "loss": 1.6466, + "step": 18260 + }, + { + "epoch": 0.6753237157581691, + "grad_norm": 1.6240331247999147, + "learning_rate": 1.5797043171108297e-07, + "loss": 1.6693, + "step": 18280 + }, + { + "epoch": 0.6760625819679702, + "grad_norm": 2.2147991050957, + "learning_rate": 1.5752467234047263e-07, + "loss": 1.6051, + "step": 18300 + }, + { + "epoch": 0.6768014481777712, + "grad_norm": 1.5203059720344088, + "learning_rate": 1.5707954595552187e-07, + "loss": 1.653, + "step": 18320 + }, + { + "epoch": 0.6775403143875722, + "grad_norm": 1.5328417599383586, + "learning_rate": 1.5663505495468e-07, + "loss": 1.6381, + "step": 18340 + }, + { + "epoch": 0.6782791805973734, + "grad_norm": 1.5445956099646183, + "learning_rate": 1.5619120173297267e-07, + "loss": 1.6037, + "step": 18360 + }, + { + "epoch": 0.6790180468071744, + "grad_norm": 1.479872310550016, + "learning_rate": 1.5574798868198912e-07, + "loss": 1.6353, + "step": 18380 + }, + { + "epoch": 0.6797569130169755, + "grad_norm": 1.7841436633262773, + "learning_rate": 1.5530541818986927e-07, + "loss": 1.7364, + "step": 18400 + }, + { + "epoch": 0.6804957792267765, + "grad_norm": 1.529508435392583, + "learning_rate": 1.5486349264129046e-07, + "loss": 1.6181, + "step": 18420 + }, + { + "epoch": 0.6812346454365775, + "grad_norm": 1.6539396952625665, + "learning_rate": 1.5442221441745533e-07, + "loss": 1.6985, + "step": 18440 + }, + { + "epoch": 0.6819735116463786, + "grad_norm": 1.5860780535239207, + "learning_rate": 1.5398158589607813e-07, + "loss": 1.6636, + "step": 18460 + }, + { + "epoch": 0.6827123778561797, + "grad_norm": 1.9353694955508953, + "learning_rate": 1.5354160945137268e-07, + "loss": 1.6277, + "step": 18480 + }, + { + "epoch": 0.6834512440659808, + "grad_norm": 1.4060414431962835, + "learning_rate": 1.5310228745403925e-07, + "loss": 1.6348, + "step": 18500 + }, + { + "epoch": 0.6841901102757818, + "grad_norm": 1.9510007446700244, + "learning_rate": 1.5266362227125164e-07, + "loss": 1.666, + "step": 18520 + }, + { + "epoch": 0.6849289764855828, + "grad_norm": 2.5976331102164694, + "learning_rate": 1.5222561626664448e-07, + "loss": 1.6437, + "step": 18540 + }, + { + "epoch": 0.6856678426953839, + "grad_norm": 1.635565277090673, + "learning_rate": 1.51788271800301e-07, + "loss": 1.6367, + "step": 18560 + }, + { + "epoch": 0.686406708905185, + "grad_norm": 1.6414633412876904, + "learning_rate": 1.5135159122873936e-07, + "loss": 1.6239, + "step": 18580 + }, + { + "epoch": 0.6871455751149861, + "grad_norm": 1.972663651970077, + "learning_rate": 1.5091557690490104e-07, + "loss": 1.6551, + "step": 18600 + }, + { + "epoch": 0.6878844413247871, + "grad_norm": 1.376913066395765, + "learning_rate": 1.504802311781371e-07, + "loss": 1.6494, + "step": 18620 + }, + { + "epoch": 0.6886233075345882, + "grad_norm": 1.441207784040776, + "learning_rate": 1.5004555639419648e-07, + "loss": 1.6697, + "step": 18640 + }, + { + "epoch": 0.6893621737443892, + "grad_norm": 2.5475644652288514, + "learning_rate": 1.4961155489521253e-07, + "loss": 1.6449, + "step": 18660 + }, + { + "epoch": 0.6901010399541903, + "grad_norm": 1.4330764200962958, + "learning_rate": 1.4917822901969108e-07, + "loss": 1.5962, + "step": 18680 + }, + { + "epoch": 0.6908399061639914, + "grad_norm": 1.5535375552432238, + "learning_rate": 1.487455811024975e-07, + "loss": 1.6682, + "step": 18700 + }, + { + "epoch": 0.6915787723737924, + "grad_norm": 1.5430558472764233, + "learning_rate": 1.4831361347484396e-07, + "loss": 1.6646, + "step": 18720 + }, + { + "epoch": 0.6923176385835935, + "grad_norm": 1.5354124537032656, + "learning_rate": 1.4788232846427718e-07, + "loss": 1.6569, + "step": 18740 + }, + { + "epoch": 0.6930565047933945, + "grad_norm": 1.723896126450271, + "learning_rate": 1.474517283946658e-07, + "loss": 1.6694, + "step": 18760 + }, + { + "epoch": 0.6937953710031955, + "grad_norm": 1.4743738549149994, + "learning_rate": 1.4702181558618777e-07, + "loss": 1.6161, + "step": 18780 + }, + { + "epoch": 0.6945342372129967, + "grad_norm": 1.675747008439809, + "learning_rate": 1.4659259235531796e-07, + "loss": 1.6558, + "step": 18800 + }, + { + "epoch": 0.6952731034227977, + "grad_norm": 1.760786257067446, + "learning_rate": 1.4616406101481574e-07, + "loss": 1.5887, + "step": 18820 + }, + { + "epoch": 0.6960119696325988, + "grad_norm": 2.8049367365120608, + "learning_rate": 1.4573622387371217e-07, + "loss": 1.6649, + "step": 18840 + }, + { + "epoch": 0.6967508358423998, + "grad_norm": 1.496529351669967, + "learning_rate": 1.4530908323729782e-07, + "loss": 1.6433, + "step": 18860 + }, + { + "epoch": 0.6974897020522008, + "grad_norm": 1.4994802420062043, + "learning_rate": 1.448826414071105e-07, + "loss": 1.6841, + "step": 18880 + }, + { + "epoch": 0.698228568262002, + "grad_norm": 1.420851366464802, + "learning_rate": 1.4445690068092265e-07, + "loss": 1.6504, + "step": 18900 + }, + { + "epoch": 0.698967434471803, + "grad_norm": 1.7411191806669424, + "learning_rate": 1.4403186335272888e-07, + "loss": 1.6298, + "step": 18920 + }, + { + "epoch": 0.6997063006816041, + "grad_norm": 1.628227507992112, + "learning_rate": 1.4360753171273364e-07, + "loss": 1.673, + "step": 18940 + }, + { + "epoch": 0.7004451668914051, + "grad_norm": 1.7368645634603777, + "learning_rate": 1.4318390804733927e-07, + "loss": 1.6198, + "step": 18960 + }, + { + "epoch": 0.7011840331012062, + "grad_norm": 1.4616447916754742, + "learning_rate": 1.4276099463913315e-07, + "loss": 1.6096, + "step": 18980 + }, + { + "epoch": 0.7019228993110073, + "grad_norm": 1.517480098110094, + "learning_rate": 1.4233879376687563e-07, + "loss": 1.6345, + "step": 19000 + }, + { + "epoch": 0.7026617655208083, + "grad_norm": 1.636195025828432, + "learning_rate": 1.419173077054878e-07, + "loss": 1.6119, + "step": 19020 + }, + { + "epoch": 0.7034006317306094, + "grad_norm": 1.5039586339840252, + "learning_rate": 1.4149653872603917e-07, + "loss": 1.7208, + "step": 19040 + }, + { + "epoch": 0.7041394979404104, + "grad_norm": 1.4728764699529369, + "learning_rate": 1.410764890957353e-07, + "loss": 1.6572, + "step": 19060 + }, + { + "epoch": 0.7048783641502115, + "grad_norm": 1.9218697223400836, + "learning_rate": 1.406571610779059e-07, + "loss": 1.6514, + "step": 19080 + }, + { + "epoch": 0.7056172303600126, + "grad_norm": 1.5761294021476189, + "learning_rate": 1.4023855693199254e-07, + "loss": 1.6381, + "step": 19100 + }, + { + "epoch": 0.7063560965698136, + "grad_norm": 1.435908518604352, + "learning_rate": 1.398206789135361e-07, + "loss": 1.6126, + "step": 19120 + }, + { + "epoch": 0.7070949627796147, + "grad_norm": 4.717577212666518, + "learning_rate": 1.3940352927416504e-07, + "loss": 1.6647, + "step": 19140 + }, + { + "epoch": 0.7078338289894157, + "grad_norm": 2.1188904047967245, + "learning_rate": 1.3898711026158323e-07, + "loss": 1.6794, + "step": 19160 + }, + { + "epoch": 0.7085726951992168, + "grad_norm": 1.5687418673344722, + "learning_rate": 1.3857142411955767e-07, + "loss": 1.6474, + "step": 19180 + }, + { + "epoch": 0.7093115614090179, + "grad_norm": 1.6271449022527302, + "learning_rate": 1.381564730879064e-07, + "loss": 1.6347, + "step": 19200 + }, + { + "epoch": 0.7100504276188189, + "grad_norm": 1.4693942372788273, + "learning_rate": 1.377422594024867e-07, + "loss": 1.6474, + "step": 19220 + }, + { + "epoch": 0.71078929382862, + "grad_norm": 1.488154969512232, + "learning_rate": 1.373287852951826e-07, + "loss": 1.6128, + "step": 19240 + }, + { + "epoch": 0.711528160038421, + "grad_norm": 1.5779135188256272, + "learning_rate": 1.3691605299389328e-07, + "loss": 1.7183, + "step": 19260 + }, + { + "epoch": 0.7122670262482221, + "grad_norm": 1.6650630460525442, + "learning_rate": 1.3650406472252083e-07, + "loss": 1.6683, + "step": 19280 + }, + { + "epoch": 0.7130058924580231, + "grad_norm": 1.4154650117196357, + "learning_rate": 1.360928227009584e-07, + "loss": 1.6717, + "step": 19300 + }, + { + "epoch": 0.7137447586678243, + "grad_norm": 1.6468623503038222, + "learning_rate": 1.3568232914507802e-07, + "loss": 1.6348, + "step": 19320 + }, + { + "epoch": 0.7144836248776253, + "grad_norm": 1.5015397491680238, + "learning_rate": 1.3527258626671898e-07, + "loss": 1.6112, + "step": 19340 + }, + { + "epoch": 0.7152224910874263, + "grad_norm": 3.3400095996186865, + "learning_rate": 1.348635962736755e-07, + "loss": 1.6523, + "step": 19360 + }, + { + "epoch": 0.7159613572972274, + "grad_norm": 1.5139103946143873, + "learning_rate": 1.344553613696854e-07, + "loss": 1.6941, + "step": 19380 + }, + { + "epoch": 0.7167002235070284, + "grad_norm": 1.4051928644539238, + "learning_rate": 1.340478837544175e-07, + "loss": 1.6237, + "step": 19400 + }, + { + "epoch": 0.7174390897168296, + "grad_norm": 1.5234389161550645, + "learning_rate": 1.3364116562346055e-07, + "loss": 1.6559, + "step": 19420 + }, + { + "epoch": 0.7181779559266306, + "grad_norm": 1.4205504198026582, + "learning_rate": 1.3323520916831077e-07, + "loss": 1.6478, + "step": 19440 + }, + { + "epoch": 0.7189168221364316, + "grad_norm": 1.5989862880917087, + "learning_rate": 1.328300165763602e-07, + "loss": 1.6123, + "step": 19460 + }, + { + "epoch": 0.7196556883462327, + "grad_norm": 1.6443108557654487, + "learning_rate": 1.3242559003088546e-07, + "loss": 1.6832, + "step": 19480 + }, + { + "epoch": 0.7203945545560337, + "grad_norm": 1.3202697054517272, + "learning_rate": 1.3202193171103506e-07, + "loss": 1.6339, + "step": 19500 + }, + { + "epoch": 0.7211334207658349, + "grad_norm": 1.5006943077767945, + "learning_rate": 1.316190437918182e-07, + "loss": 1.6469, + "step": 19520 + }, + { + "epoch": 0.7218722869756359, + "grad_norm": 1.7379534891877164, + "learning_rate": 1.3121692844409321e-07, + "loss": 1.6797, + "step": 19540 + }, + { + "epoch": 0.7226111531854369, + "grad_norm": 1.526373724090785, + "learning_rate": 1.308155878345553e-07, + "loss": 1.6636, + "step": 19560 + }, + { + "epoch": 0.723350019395238, + "grad_norm": 2.0046685771285793, + "learning_rate": 1.3041502412572542e-07, + "loss": 1.6748, + "step": 19580 + }, + { + "epoch": 0.724088885605039, + "grad_norm": 1.4955882650728989, + "learning_rate": 1.3001523947593845e-07, + "loss": 1.6293, + "step": 19600 + }, + { + "epoch": 0.7248277518148402, + "grad_norm": 2.4302511767713324, + "learning_rate": 1.2961623603933134e-07, + "loss": 1.6004, + "step": 19620 + }, + { + "epoch": 0.7255666180246412, + "grad_norm": 1.6494154347871601, + "learning_rate": 1.2921801596583153e-07, + "loss": 1.6136, + "step": 19640 + }, + { + "epoch": 0.7263054842344423, + "grad_norm": 1.4459727786023948, + "learning_rate": 1.2882058140114594e-07, + "loss": 1.6435, + "step": 19660 + }, + { + "epoch": 0.7270443504442433, + "grad_norm": 1.4490955525578755, + "learning_rate": 1.2842393448674869e-07, + "loss": 1.6508, + "step": 19680 + }, + { + "epoch": 0.7277832166540443, + "grad_norm": 1.7167939191812815, + "learning_rate": 1.280280773598699e-07, + "loss": 1.6299, + "step": 19700 + }, + { + "epoch": 0.7285220828638455, + "grad_norm": 1.8412228497101617, + "learning_rate": 1.2763301215348402e-07, + "loss": 1.6758, + "step": 19720 + }, + { + "epoch": 0.7292609490736465, + "grad_norm": 1.6407591339864582, + "learning_rate": 1.2723874099629866e-07, + "loss": 1.6443, + "step": 19740 + }, + { + "epoch": 0.7299998152834476, + "grad_norm": 2.010243920808459, + "learning_rate": 1.268452660127427e-07, + "loss": 1.6317, + "step": 19760 + }, + { + "epoch": 0.7307386814932486, + "grad_norm": 1.521357800662826, + "learning_rate": 1.2645258932295518e-07, + "loss": 1.6162, + "step": 19780 + }, + { + "epoch": 0.7314775477030496, + "grad_norm": 1.5657714545631887, + "learning_rate": 1.260607130427737e-07, + "loss": 1.6134, + "step": 19800 + }, + { + "epoch": 0.7322164139128507, + "grad_norm": 1.7902489767561236, + "learning_rate": 1.2566963928372308e-07, + "loss": 1.6633, + "step": 19820 + }, + { + "epoch": 0.7329552801226518, + "grad_norm": 2.0435731294538466, + "learning_rate": 1.2527937015300378e-07, + "loss": 1.6505, + "step": 19840 + }, + { + "epoch": 0.7336941463324529, + "grad_norm": 5.207754525218824, + "learning_rate": 1.2488990775348092e-07, + "loss": 1.6453, + "step": 19860 + }, + { + "epoch": 0.7344330125422539, + "grad_norm": 1.6112840529464336, + "learning_rate": 1.245012541836728e-07, + "loss": 1.6082, + "step": 19880 + }, + { + "epoch": 0.7351718787520549, + "grad_norm": 1.4827262765821532, + "learning_rate": 1.241134115377394e-07, + "loss": 1.6161, + "step": 19900 + }, + { + "epoch": 0.735910744961856, + "grad_norm": 1.7219801506968755, + "learning_rate": 1.2372638190547122e-07, + "loss": 1.6305, + "step": 19920 + }, + { + "epoch": 0.7366496111716571, + "grad_norm": 1.3720219936046893, + "learning_rate": 1.233401673722782e-07, + "loss": 1.6099, + "step": 19940 + }, + { + "epoch": 0.7373884773814582, + "grad_norm": 1.7432612385035637, + "learning_rate": 1.229547700191783e-07, + "loss": 1.6372, + "step": 19960 + }, + { + "epoch": 0.7381273435912592, + "grad_norm": 3.2034872788326925, + "learning_rate": 1.2257019192278617e-07, + "loss": 1.6147, + "step": 19980 + }, + { + "epoch": 0.7388662098010603, + "grad_norm": 1.6442745462596664, + "learning_rate": 1.2218643515530227e-07, + "loss": 1.6344, + "step": 20000 + }, + { + "epoch": 0.7396050760108613, + "grad_norm": 1.959102806007239, + "learning_rate": 1.218035017845015e-07, + "loss": 1.6451, + "step": 20020 + }, + { + "epoch": 0.7403439422206624, + "grad_norm": 1.6408059937998853, + "learning_rate": 1.214213938737219e-07, + "loss": 1.6757, + "step": 20040 + }, + { + "epoch": 0.7410828084304635, + "grad_norm": 1.5657243128524525, + "learning_rate": 1.210591578161399e-07, + "loss": 1.6359, + "step": 20060 + }, + { + "epoch": 0.7418216746402645, + "grad_norm": 1.4736673628427441, + "learning_rate": 1.2067866547022443e-07, + "loss": 1.6603, + "step": 20080 + }, + { + "epoch": 0.7425605408500656, + "grad_norm": 1.4833315219916223, + "learning_rate": 1.2029900464522203e-07, + "loss": 1.6342, + "step": 20100 + }, + { + "epoch": 0.7432994070598666, + "grad_norm": 1.9340686772443259, + "learning_rate": 1.1992017738683768e-07, + "loss": 1.6416, + "step": 20120 + }, + { + "epoch": 0.7440382732696676, + "grad_norm": 1.6355654798248513, + "learning_rate": 1.1954218573628499e-07, + "loss": 1.6678, + "step": 20140 + }, + { + "epoch": 0.7447771394794688, + "grad_norm": 1.5624481100138734, + "learning_rate": 1.1916503173027475e-07, + "loss": 1.614, + "step": 20160 + }, + { + "epoch": 0.7455160056892698, + "grad_norm": 1.5029974061648055, + "learning_rate": 1.1878871740100476e-07, + "loss": 1.639, + "step": 20180 + }, + { + "epoch": 0.7462548718990709, + "grad_norm": 1.4683397727523646, + "learning_rate": 1.1841324477614812e-07, + "loss": 1.6516, + "step": 20200 + }, + { + "epoch": 0.7469937381088719, + "grad_norm": 1.478703041295488, + "learning_rate": 1.1803861587884268e-07, + "loss": 1.7247, + "step": 20220 + }, + { + "epoch": 0.7477326043186729, + "grad_norm": 1.4765169074470068, + "learning_rate": 1.1766483272768017e-07, + "loss": 1.6786, + "step": 20240 + }, + { + "epoch": 0.7484714705284741, + "grad_norm": 1.3861683142566674, + "learning_rate": 1.1729189733669528e-07, + "loss": 1.6242, + "step": 20260 + }, + { + "epoch": 0.7492103367382751, + "grad_norm": 1.5107470749741048, + "learning_rate": 1.1691981171535459e-07, + "loss": 1.6476, + "step": 20280 + }, + { + "epoch": 0.7499492029480762, + "grad_norm": 1.5696138247640767, + "learning_rate": 1.1654857786854591e-07, + "loss": 1.6691, + "step": 20300 + } + ], + "logging_steps": 20, + "max_steps": 27068, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 6767, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3859009492746240.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}