diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,24589 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.7787942316244203, + "eval_steps": 500, + "global_step": 3500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000508226923321263, + "grad_norm": 11.132791519165039, + "learning_rate": 2.0325203252032523e-08, + "loss": 1.5488, + "step": 1 + }, + { + "epoch": 0.001016453846642526, + "grad_norm": 14.91269588470459, + "learning_rate": 4.0650406504065046e-08, + "loss": 1.7579, + "step": 2 + }, + { + "epoch": 0.0015246807699637887, + "grad_norm": 14.403929710388184, + "learning_rate": 6.097560975609757e-08, + "loss": 1.5603, + "step": 3 + }, + { + "epoch": 0.002032907693285052, + "grad_norm": 11.182633399963379, + "learning_rate": 8.130081300813009e-08, + "loss": 1.6072, + "step": 4 + }, + { + "epoch": 0.0025411346166063146, + "grad_norm": 13.164581298828125, + "learning_rate": 1.0162601626016261e-07, + "loss": 1.5791, + "step": 5 + }, + { + "epoch": 0.0030493615399275775, + "grad_norm": 16.108863830566406, + "learning_rate": 1.2195121951219514e-07, + "loss": 1.5734, + "step": 6 + }, + { + "epoch": 0.0035575884632488407, + "grad_norm": 12.01136302947998, + "learning_rate": 1.4227642276422766e-07, + "loss": 1.5399, + "step": 7 + }, + { + "epoch": 0.004065815386570104, + "grad_norm": 11.824952125549316, + "learning_rate": 1.6260162601626018e-07, + "loss": 1.5702, + "step": 8 + }, + { + "epoch": 0.004574042309891366, + "grad_norm": 11.490732192993164, + "learning_rate": 1.8292682926829268e-07, + "loss": 1.4625, + "step": 9 + }, + { + "epoch": 0.005082269233212629, + "grad_norm": 14.02952766418457, + "learning_rate": 2.0325203252032523e-07, + "loss": 1.6021, + "step": 10 + }, + { + "epoch": 0.005590496156533892, + "grad_norm": 13.60211181640625, + "learning_rate": 2.2357723577235775e-07, + "loss": 1.5947, + "step": 11 + }, + { + "epoch": 0.006098723079855155, + "grad_norm": 10.582362174987793, + "learning_rate": 2.439024390243903e-07, + "loss": 1.4733, + "step": 12 + }, + { + "epoch": 0.006606950003176419, + "grad_norm": 11.117897987365723, + "learning_rate": 2.642276422764228e-07, + "loss": 1.6174, + "step": 13 + }, + { + "epoch": 0.0071151769264976815, + "grad_norm": 10.191961288452148, + "learning_rate": 2.845528455284553e-07, + "loss": 1.4654, + "step": 14 + }, + { + "epoch": 0.007623403849818944, + "grad_norm": 14.525583267211914, + "learning_rate": 3.0487804878048784e-07, + "loss": 1.6612, + "step": 15 + }, + { + "epoch": 0.008131630773140207, + "grad_norm": 12.473858833312988, + "learning_rate": 3.2520325203252037e-07, + "loss": 1.6033, + "step": 16 + }, + { + "epoch": 0.008639857696461471, + "grad_norm": 11.088489532470703, + "learning_rate": 3.455284552845529e-07, + "loss": 1.4625, + "step": 17 + }, + { + "epoch": 0.009148084619782733, + "grad_norm": 13.150553703308105, + "learning_rate": 3.6585365853658536e-07, + "loss": 1.6221, + "step": 18 + }, + { + "epoch": 0.009656311543103997, + "grad_norm": 10.35750675201416, + "learning_rate": 3.8617886178861793e-07, + "loss": 1.4358, + "step": 19 + }, + { + "epoch": 0.010164538466425259, + "grad_norm": 11.396235466003418, + "learning_rate": 4.0650406504065046e-07, + "loss": 1.491, + "step": 20 + }, + { + "epoch": 0.010672765389746522, + "grad_norm": 10.519694328308105, + "learning_rate": 4.26829268292683e-07, + "loss": 1.5748, + "step": 21 + }, + { + "epoch": 0.011180992313067784, + "grad_norm": 12.369754791259766, + "learning_rate": 4.471544715447155e-07, + "loss": 1.5489, + "step": 22 + }, + { + "epoch": 0.011689219236389048, + "grad_norm": 10.128881454467773, + "learning_rate": 4.6747967479674797e-07, + "loss": 1.5057, + "step": 23 + }, + { + "epoch": 0.01219744615971031, + "grad_norm": 8.99166488647461, + "learning_rate": 4.878048780487805e-07, + "loss": 1.5258, + "step": 24 + }, + { + "epoch": 0.012705673083031574, + "grad_norm": 12.331857681274414, + "learning_rate": 5.081300813008131e-07, + "loss": 1.3793, + "step": 25 + }, + { + "epoch": 0.013213900006352837, + "grad_norm": 7.486877918243408, + "learning_rate": 5.284552845528456e-07, + "loss": 1.4606, + "step": 26 + }, + { + "epoch": 0.0137221269296741, + "grad_norm": 9.731522560119629, + "learning_rate": 5.487804878048781e-07, + "loss": 1.4973, + "step": 27 + }, + { + "epoch": 0.014230353852995363, + "grad_norm": 6.014042854309082, + "learning_rate": 5.691056910569106e-07, + "loss": 1.487, + "step": 28 + }, + { + "epoch": 0.014738580776316625, + "grad_norm": 6.246473789215088, + "learning_rate": 5.894308943089432e-07, + "loss": 1.4415, + "step": 29 + }, + { + "epoch": 0.015246807699637889, + "grad_norm": 5.654910087585449, + "learning_rate": 6.097560975609757e-07, + "loss": 1.506, + "step": 30 + }, + { + "epoch": 0.01575503462295915, + "grad_norm": 5.190532684326172, + "learning_rate": 6.300813008130081e-07, + "loss": 1.4196, + "step": 31 + }, + { + "epoch": 0.016263261546280414, + "grad_norm": 5.3967461585998535, + "learning_rate": 6.504065040650407e-07, + "loss": 1.4139, + "step": 32 + }, + { + "epoch": 0.016771488469601678, + "grad_norm": 5.363631725311279, + "learning_rate": 6.707317073170733e-07, + "loss": 1.4304, + "step": 33 + }, + { + "epoch": 0.017279715392922942, + "grad_norm": 4.950409889221191, + "learning_rate": 6.910569105691058e-07, + "loss": 1.3548, + "step": 34 + }, + { + "epoch": 0.017787942316244202, + "grad_norm": 5.297672271728516, + "learning_rate": 7.113821138211383e-07, + "loss": 1.4669, + "step": 35 + }, + { + "epoch": 0.018296169239565466, + "grad_norm": 5.159802436828613, + "learning_rate": 7.317073170731707e-07, + "loss": 1.4151, + "step": 36 + }, + { + "epoch": 0.01880439616288673, + "grad_norm": 4.77419900894165, + "learning_rate": 7.520325203252033e-07, + "loss": 1.379, + "step": 37 + }, + { + "epoch": 0.019312623086207993, + "grad_norm": 4.516266822814941, + "learning_rate": 7.723577235772359e-07, + "loss": 1.3126, + "step": 38 + }, + { + "epoch": 0.019820850009529253, + "grad_norm": 4.660902976989746, + "learning_rate": 7.926829268292684e-07, + "loss": 1.4777, + "step": 39 + }, + { + "epoch": 0.020329076932850517, + "grad_norm": 4.3722968101501465, + "learning_rate": 8.130081300813009e-07, + "loss": 1.4056, + "step": 40 + }, + { + "epoch": 0.02083730385617178, + "grad_norm": 4.381669521331787, + "learning_rate": 8.333333333333333e-07, + "loss": 1.38, + "step": 41 + }, + { + "epoch": 0.021345530779493044, + "grad_norm": 4.524435520172119, + "learning_rate": 8.53658536585366e-07, + "loss": 1.4145, + "step": 42 + }, + { + "epoch": 0.021853757702814308, + "grad_norm": 6.599025726318359, + "learning_rate": 8.739837398373985e-07, + "loss": 1.3931, + "step": 43 + }, + { + "epoch": 0.02236198462613557, + "grad_norm": 4.480719566345215, + "learning_rate": 8.94308943089431e-07, + "loss": 1.3041, + "step": 44 + }, + { + "epoch": 0.022870211549456832, + "grad_norm": 4.4983906745910645, + "learning_rate": 9.146341463414634e-07, + "loss": 1.3611, + "step": 45 + }, + { + "epoch": 0.023378438472778096, + "grad_norm": 4.583948612213135, + "learning_rate": 9.349593495934959e-07, + "loss": 1.3255, + "step": 46 + }, + { + "epoch": 0.02388666539609936, + "grad_norm": 4.392378807067871, + "learning_rate": 9.552845528455287e-07, + "loss": 1.4201, + "step": 47 + }, + { + "epoch": 0.02439489231942062, + "grad_norm": 4.692641258239746, + "learning_rate": 9.75609756097561e-07, + "loss": 1.3912, + "step": 48 + }, + { + "epoch": 0.024903119242741883, + "grad_norm": 4.219020843505859, + "learning_rate": 9.959349593495935e-07, + "loss": 1.4172, + "step": 49 + }, + { + "epoch": 0.025411346166063147, + "grad_norm": 3.9937944412231445, + "learning_rate": 1.0162601626016261e-06, + "loss": 1.4778, + "step": 50 + }, + { + "epoch": 0.02591957308938441, + "grad_norm": 4.721486568450928, + "learning_rate": 1.0365853658536586e-06, + "loss": 1.3501, + "step": 51 + }, + { + "epoch": 0.026427800012705675, + "grad_norm": 4.057364463806152, + "learning_rate": 1.0569105691056912e-06, + "loss": 1.4107, + "step": 52 + }, + { + "epoch": 0.026936026936026935, + "grad_norm": 4.496649742126465, + "learning_rate": 1.0772357723577236e-06, + "loss": 1.398, + "step": 53 + }, + { + "epoch": 0.0274442538593482, + "grad_norm": 4.019273281097412, + "learning_rate": 1.0975609756097562e-06, + "loss": 1.2613, + "step": 54 + }, + { + "epoch": 0.027952480782669462, + "grad_norm": 4.136529922485352, + "learning_rate": 1.1178861788617887e-06, + "loss": 1.3537, + "step": 55 + }, + { + "epoch": 0.028460707705990726, + "grad_norm": 4.095795631408691, + "learning_rate": 1.1382113821138213e-06, + "loss": 1.4782, + "step": 56 + }, + { + "epoch": 0.028968934629311986, + "grad_norm": 3.8188765048980713, + "learning_rate": 1.158536585365854e-06, + "loss": 1.3162, + "step": 57 + }, + { + "epoch": 0.02947716155263325, + "grad_norm": 3.8744707107543945, + "learning_rate": 1.1788617886178863e-06, + "loss": 1.2827, + "step": 58 + }, + { + "epoch": 0.029985388475954514, + "grad_norm": 4.022250652313232, + "learning_rate": 1.1991869918699187e-06, + "loss": 1.3503, + "step": 59 + }, + { + "epoch": 0.030493615399275777, + "grad_norm": 4.049084186553955, + "learning_rate": 1.2195121951219514e-06, + "loss": 1.252, + "step": 60 + }, + { + "epoch": 0.03100184232259704, + "grad_norm": 3.750056028366089, + "learning_rate": 1.2398373983739838e-06, + "loss": 1.3227, + "step": 61 + }, + { + "epoch": 0.0315100692459183, + "grad_norm": 4.167194366455078, + "learning_rate": 1.2601626016260162e-06, + "loss": 1.3036, + "step": 62 + }, + { + "epoch": 0.03201829616923957, + "grad_norm": 3.954740285873413, + "learning_rate": 1.2804878048780488e-06, + "loss": 1.2946, + "step": 63 + }, + { + "epoch": 0.03252652309256083, + "grad_norm": 4.393954753875732, + "learning_rate": 1.3008130081300815e-06, + "loss": 1.3785, + "step": 64 + }, + { + "epoch": 0.03303475001588209, + "grad_norm": 3.7162604331970215, + "learning_rate": 1.3211382113821139e-06, + "loss": 1.3086, + "step": 65 + }, + { + "epoch": 0.033542976939203356, + "grad_norm": 3.7479500770568848, + "learning_rate": 1.3414634146341465e-06, + "loss": 1.3727, + "step": 66 + }, + { + "epoch": 0.034051203862524616, + "grad_norm": 3.585484504699707, + "learning_rate": 1.361788617886179e-06, + "loss": 1.3153, + "step": 67 + }, + { + "epoch": 0.034559430785845884, + "grad_norm": 3.7799341678619385, + "learning_rate": 1.3821138211382116e-06, + "loss": 1.2355, + "step": 68 + }, + { + "epoch": 0.035067657709167144, + "grad_norm": 4.035519123077393, + "learning_rate": 1.4024390243902442e-06, + "loss": 1.3052, + "step": 69 + }, + { + "epoch": 0.035575884632488404, + "grad_norm": 3.966735363006592, + "learning_rate": 1.4227642276422766e-06, + "loss": 1.3895, + "step": 70 + }, + { + "epoch": 0.03608411155580967, + "grad_norm": 3.9452250003814697, + "learning_rate": 1.4430894308943092e-06, + "loss": 1.3275, + "step": 71 + }, + { + "epoch": 0.03659233847913093, + "grad_norm": 4.105930328369141, + "learning_rate": 1.4634146341463414e-06, + "loss": 1.4562, + "step": 72 + }, + { + "epoch": 0.03710056540245219, + "grad_norm": 3.8830127716064453, + "learning_rate": 1.483739837398374e-06, + "loss": 1.252, + "step": 73 + }, + { + "epoch": 0.03760879232577346, + "grad_norm": 4.440551280975342, + "learning_rate": 1.5040650406504067e-06, + "loss": 1.3924, + "step": 74 + }, + { + "epoch": 0.03811701924909472, + "grad_norm": 3.8785653114318848, + "learning_rate": 1.5243902439024391e-06, + "loss": 1.3019, + "step": 75 + }, + { + "epoch": 0.038625246172415986, + "grad_norm": 3.895341396331787, + "learning_rate": 1.5447154471544717e-06, + "loss": 1.2417, + "step": 76 + }, + { + "epoch": 0.039133473095737246, + "grad_norm": 3.4419727325439453, + "learning_rate": 1.5650406504065042e-06, + "loss": 1.2863, + "step": 77 + }, + { + "epoch": 0.03964170001905851, + "grad_norm": 3.9680559635162354, + "learning_rate": 1.5853658536585368e-06, + "loss": 1.3943, + "step": 78 + }, + { + "epoch": 0.040149926942379774, + "grad_norm": 3.7686707973480225, + "learning_rate": 1.6056910569105694e-06, + "loss": 1.3998, + "step": 79 + }, + { + "epoch": 0.040658153865701034, + "grad_norm": 4.245886325836182, + "learning_rate": 1.6260162601626018e-06, + "loss": 1.4582, + "step": 80 + }, + { + "epoch": 0.0411663807890223, + "grad_norm": 3.924715518951416, + "learning_rate": 1.6463414634146345e-06, + "loss": 1.3373, + "step": 81 + }, + { + "epoch": 0.04167460771234356, + "grad_norm": 4.548923969268799, + "learning_rate": 1.6666666666666667e-06, + "loss": 1.2625, + "step": 82 + }, + { + "epoch": 0.04218283463566482, + "grad_norm": 4.1088714599609375, + "learning_rate": 1.6869918699186993e-06, + "loss": 1.3832, + "step": 83 + }, + { + "epoch": 0.04269106155898609, + "grad_norm": 3.9086315631866455, + "learning_rate": 1.707317073170732e-06, + "loss": 1.3633, + "step": 84 + }, + { + "epoch": 0.04319928848230735, + "grad_norm": 4.148958683013916, + "learning_rate": 1.7276422764227643e-06, + "loss": 1.2266, + "step": 85 + }, + { + "epoch": 0.043707515405628616, + "grad_norm": 3.861931562423706, + "learning_rate": 1.747967479674797e-06, + "loss": 1.4014, + "step": 86 + }, + { + "epoch": 0.04421574232894988, + "grad_norm": 4.312771320343018, + "learning_rate": 1.7682926829268294e-06, + "loss": 1.3073, + "step": 87 + }, + { + "epoch": 0.04472396925227114, + "grad_norm": 3.94911789894104, + "learning_rate": 1.788617886178862e-06, + "loss": 1.4017, + "step": 88 + }, + { + "epoch": 0.045232196175592404, + "grad_norm": 3.828352212905884, + "learning_rate": 1.8089430894308946e-06, + "loss": 1.238, + "step": 89 + }, + { + "epoch": 0.045740423098913664, + "grad_norm": 3.622032403945923, + "learning_rate": 1.8292682926829268e-06, + "loss": 1.275, + "step": 90 + }, + { + "epoch": 0.046248650022234924, + "grad_norm": 3.982901096343994, + "learning_rate": 1.8495934959349595e-06, + "loss": 1.247, + "step": 91 + }, + { + "epoch": 0.04675687694555619, + "grad_norm": 3.9050590991973877, + "learning_rate": 1.8699186991869919e-06, + "loss": 1.2841, + "step": 92 + }, + { + "epoch": 0.04726510386887745, + "grad_norm": 3.8051700592041016, + "learning_rate": 1.8902439024390245e-06, + "loss": 1.3774, + "step": 93 + }, + { + "epoch": 0.04777333079219872, + "grad_norm": 3.988053798675537, + "learning_rate": 1.9105691056910574e-06, + "loss": 1.3044, + "step": 94 + }, + { + "epoch": 0.04828155771551998, + "grad_norm": 4.018758296966553, + "learning_rate": 1.9308943089430896e-06, + "loss": 1.2674, + "step": 95 + }, + { + "epoch": 0.04878978463884124, + "grad_norm": 3.703763723373413, + "learning_rate": 1.951219512195122e-06, + "loss": 1.4012, + "step": 96 + }, + { + "epoch": 0.04929801156216251, + "grad_norm": 4.037637710571289, + "learning_rate": 1.9715447154471544e-06, + "loss": 1.3216, + "step": 97 + }, + { + "epoch": 0.04980623848548377, + "grad_norm": 3.6200430393218994, + "learning_rate": 1.991869918699187e-06, + "loss": 1.1986, + "step": 98 + }, + { + "epoch": 0.050314465408805034, + "grad_norm": 5.854780673980713, + "learning_rate": 2.0121951219512197e-06, + "loss": 1.4021, + "step": 99 + }, + { + "epoch": 0.050822692332126294, + "grad_norm": 4.096163272857666, + "learning_rate": 2.0325203252032523e-06, + "loss": 1.3754, + "step": 100 + }, + { + "epoch": 0.051330919255447555, + "grad_norm": 3.9238216876983643, + "learning_rate": 2.052845528455285e-06, + "loss": 1.3719, + "step": 101 + }, + { + "epoch": 0.05183914617876882, + "grad_norm": 3.885479211807251, + "learning_rate": 2.073170731707317e-06, + "loss": 1.3589, + "step": 102 + }, + { + "epoch": 0.05234737310209008, + "grad_norm": 3.7331907749176025, + "learning_rate": 2.0934959349593497e-06, + "loss": 1.3464, + "step": 103 + }, + { + "epoch": 0.05285560002541135, + "grad_norm": 3.8253138065338135, + "learning_rate": 2.1138211382113824e-06, + "loss": 1.4048, + "step": 104 + }, + { + "epoch": 0.05336382694873261, + "grad_norm": 4.024075984954834, + "learning_rate": 2.1341463414634146e-06, + "loss": 1.3333, + "step": 105 + }, + { + "epoch": 0.05387205387205387, + "grad_norm": 4.16942834854126, + "learning_rate": 2.154471544715447e-06, + "loss": 1.3049, + "step": 106 + }, + { + "epoch": 0.05438028079537514, + "grad_norm": 3.7079477310180664, + "learning_rate": 2.17479674796748e-06, + "loss": 1.2983, + "step": 107 + }, + { + "epoch": 0.0548885077186964, + "grad_norm": 4.08198881149292, + "learning_rate": 2.1951219512195125e-06, + "loss": 1.2067, + "step": 108 + }, + { + "epoch": 0.055396734642017664, + "grad_norm": 4.052254676818848, + "learning_rate": 2.215447154471545e-06, + "loss": 1.3061, + "step": 109 + }, + { + "epoch": 0.055904961565338925, + "grad_norm": 4.361356735229492, + "learning_rate": 2.2357723577235773e-06, + "loss": 1.3899, + "step": 110 + }, + { + "epoch": 0.056413188488660185, + "grad_norm": 8.015365600585938, + "learning_rate": 2.25609756097561e-06, + "loss": 1.3209, + "step": 111 + }, + { + "epoch": 0.05692141541198145, + "grad_norm": 3.764535665512085, + "learning_rate": 2.2764227642276426e-06, + "loss": 1.287, + "step": 112 + }, + { + "epoch": 0.05742964233530271, + "grad_norm": 5.49539852142334, + "learning_rate": 2.296747967479675e-06, + "loss": 1.3783, + "step": 113 + }, + { + "epoch": 0.05793786925862397, + "grad_norm": 3.8290023803710938, + "learning_rate": 2.317073170731708e-06, + "loss": 1.234, + "step": 114 + }, + { + "epoch": 0.05844609618194524, + "grad_norm": 4.1116228103637695, + "learning_rate": 2.33739837398374e-06, + "loss": 1.3752, + "step": 115 + }, + { + "epoch": 0.0589543231052665, + "grad_norm": 4.267752170562744, + "learning_rate": 2.3577235772357727e-06, + "loss": 1.3222, + "step": 116 + }, + { + "epoch": 0.05946255002858777, + "grad_norm": 3.951112985610962, + "learning_rate": 2.378048780487805e-06, + "loss": 1.3798, + "step": 117 + }, + { + "epoch": 0.05997077695190903, + "grad_norm": 3.748058319091797, + "learning_rate": 2.3983739837398375e-06, + "loss": 1.2211, + "step": 118 + }, + { + "epoch": 0.06047900387523029, + "grad_norm": 3.887105941772461, + "learning_rate": 2.41869918699187e-06, + "loss": 1.2549, + "step": 119 + }, + { + "epoch": 0.060987230798551555, + "grad_norm": 3.793177843093872, + "learning_rate": 2.4390243902439027e-06, + "loss": 1.3849, + "step": 120 + }, + { + "epoch": 0.061495457721872815, + "grad_norm": 4.098204612731934, + "learning_rate": 2.4593495934959354e-06, + "loss": 1.3509, + "step": 121 + }, + { + "epoch": 0.06200368464519408, + "grad_norm": 3.8322818279266357, + "learning_rate": 2.4796747967479676e-06, + "loss": 1.1903, + "step": 122 + }, + { + "epoch": 0.06251191156851534, + "grad_norm": 4.026457786560059, + "learning_rate": 2.5e-06, + "loss": 1.2147, + "step": 123 + }, + { + "epoch": 0.0630201384918366, + "grad_norm": 3.7052459716796875, + "learning_rate": 2.5203252032520324e-06, + "loss": 1.398, + "step": 124 + }, + { + "epoch": 0.06352836541515787, + "grad_norm": 3.5341570377349854, + "learning_rate": 2.5406504065040655e-06, + "loss": 1.2919, + "step": 125 + }, + { + "epoch": 0.06403659233847914, + "grad_norm": 4.211786270141602, + "learning_rate": 2.5609756097560977e-06, + "loss": 1.1977, + "step": 126 + }, + { + "epoch": 0.06454481926180039, + "grad_norm": 3.801708221435547, + "learning_rate": 2.5813008130081303e-06, + "loss": 1.2276, + "step": 127 + }, + { + "epoch": 0.06505304618512166, + "grad_norm": 4.580326557159424, + "learning_rate": 2.601626016260163e-06, + "loss": 1.3152, + "step": 128 + }, + { + "epoch": 0.06556127310844292, + "grad_norm": 3.78059720993042, + "learning_rate": 2.6219512195121956e-06, + "loss": 1.2336, + "step": 129 + }, + { + "epoch": 0.06606950003176418, + "grad_norm": 4.220641136169434, + "learning_rate": 2.6422764227642278e-06, + "loss": 1.3903, + "step": 130 + }, + { + "epoch": 0.06657772695508545, + "grad_norm": 3.944988965988159, + "learning_rate": 2.66260162601626e-06, + "loss": 1.319, + "step": 131 + }, + { + "epoch": 0.06708595387840671, + "grad_norm": 4.109734535217285, + "learning_rate": 2.682926829268293e-06, + "loss": 1.2436, + "step": 132 + }, + { + "epoch": 0.06759418080172797, + "grad_norm": 3.725135326385498, + "learning_rate": 2.7032520325203252e-06, + "loss": 1.3013, + "step": 133 + }, + { + "epoch": 0.06810240772504923, + "grad_norm": 4.149574279785156, + "learning_rate": 2.723577235772358e-06, + "loss": 1.3835, + "step": 134 + }, + { + "epoch": 0.0686106346483705, + "grad_norm": 3.8214473724365234, + "learning_rate": 2.7439024390243905e-06, + "loss": 1.3422, + "step": 135 + }, + { + "epoch": 0.06911886157169177, + "grad_norm": 3.678873300552368, + "learning_rate": 2.764227642276423e-06, + "loss": 1.1785, + "step": 136 + }, + { + "epoch": 0.06962708849501302, + "grad_norm": 4.062511444091797, + "learning_rate": 2.7845528455284553e-06, + "loss": 1.2874, + "step": 137 + }, + { + "epoch": 0.07013531541833429, + "grad_norm": 3.8361012935638428, + "learning_rate": 2.8048780487804884e-06, + "loss": 1.3022, + "step": 138 + }, + { + "epoch": 0.07064354234165555, + "grad_norm": 4.04416561126709, + "learning_rate": 2.8252032520325206e-06, + "loss": 1.3684, + "step": 139 + }, + { + "epoch": 0.07115176926497681, + "grad_norm": 4.1772894859313965, + "learning_rate": 2.845528455284553e-06, + "loss": 1.3542, + "step": 140 + }, + { + "epoch": 0.07165999618829808, + "grad_norm": 3.7365682125091553, + "learning_rate": 2.8658536585365854e-06, + "loss": 1.3469, + "step": 141 + }, + { + "epoch": 0.07216822311161934, + "grad_norm": 3.7443156242370605, + "learning_rate": 2.8861788617886185e-06, + "loss": 1.3453, + "step": 142 + }, + { + "epoch": 0.0726764500349406, + "grad_norm": 3.999711513519287, + "learning_rate": 2.9065040650406507e-06, + "loss": 1.4442, + "step": 143 + }, + { + "epoch": 0.07318467695826186, + "grad_norm": 3.5781519412994385, + "learning_rate": 2.926829268292683e-06, + "loss": 1.2533, + "step": 144 + }, + { + "epoch": 0.07369290388158313, + "grad_norm": 3.80576491355896, + "learning_rate": 2.947154471544716e-06, + "loss": 1.2788, + "step": 145 + }, + { + "epoch": 0.07420113080490438, + "grad_norm": 4.316473960876465, + "learning_rate": 2.967479674796748e-06, + "loss": 1.2272, + "step": 146 + }, + { + "epoch": 0.07470935772822565, + "grad_norm": 4.160771369934082, + "learning_rate": 2.9878048780487808e-06, + "loss": 1.2916, + "step": 147 + }, + { + "epoch": 0.07521758465154692, + "grad_norm": 3.7304327487945557, + "learning_rate": 3.0081300813008134e-06, + "loss": 1.2154, + "step": 148 + }, + { + "epoch": 0.07572581157486818, + "grad_norm": 5.959589958190918, + "learning_rate": 3.028455284552846e-06, + "loss": 1.4461, + "step": 149 + }, + { + "epoch": 0.07623403849818944, + "grad_norm": 3.827523708343506, + "learning_rate": 3.0487804878048782e-06, + "loss": 1.329, + "step": 150 + }, + { + "epoch": 0.0767422654215107, + "grad_norm": 3.866091728210449, + "learning_rate": 3.0691056910569104e-06, + "loss": 1.2627, + "step": 151 + }, + { + "epoch": 0.07725049234483197, + "grad_norm": 3.7172887325286865, + "learning_rate": 3.0894308943089435e-06, + "loss": 1.4103, + "step": 152 + }, + { + "epoch": 0.07775871926815323, + "grad_norm": 4.245830535888672, + "learning_rate": 3.1097560975609757e-06, + "loss": 1.3797, + "step": 153 + }, + { + "epoch": 0.07826694619147449, + "grad_norm": 4.362545490264893, + "learning_rate": 3.1300813008130083e-06, + "loss": 1.3229, + "step": 154 + }, + { + "epoch": 0.07877517311479576, + "grad_norm": 3.8218653202056885, + "learning_rate": 3.150406504065041e-06, + "loss": 1.1794, + "step": 155 + }, + { + "epoch": 0.07928340003811701, + "grad_norm": 3.770843267440796, + "learning_rate": 3.1707317073170736e-06, + "loss": 1.2591, + "step": 156 + }, + { + "epoch": 0.07979162696143828, + "grad_norm": 3.6830074787139893, + "learning_rate": 3.1910569105691058e-06, + "loss": 1.2592, + "step": 157 + }, + { + "epoch": 0.08029985388475955, + "grad_norm": 4.0969367027282715, + "learning_rate": 3.211382113821139e-06, + "loss": 1.2888, + "step": 158 + }, + { + "epoch": 0.08080808080808081, + "grad_norm": 4.271267890930176, + "learning_rate": 3.231707317073171e-06, + "loss": 1.3786, + "step": 159 + }, + { + "epoch": 0.08131630773140207, + "grad_norm": 3.965411424636841, + "learning_rate": 3.2520325203252037e-06, + "loss": 1.2607, + "step": 160 + }, + { + "epoch": 0.08182453465472334, + "grad_norm": 3.780172824859619, + "learning_rate": 3.272357723577236e-06, + "loss": 1.2708, + "step": 161 + }, + { + "epoch": 0.0823327615780446, + "grad_norm": 3.947627305984497, + "learning_rate": 3.292682926829269e-06, + "loss": 1.4423, + "step": 162 + }, + { + "epoch": 0.08284098850136586, + "grad_norm": 3.788705348968506, + "learning_rate": 3.313008130081301e-06, + "loss": 1.2629, + "step": 163 + }, + { + "epoch": 0.08334921542468712, + "grad_norm": 4.064167499542236, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.3003, + "step": 164 + }, + { + "epoch": 0.08385744234800839, + "grad_norm": 3.8234219551086426, + "learning_rate": 3.3536585365853664e-06, + "loss": 1.2796, + "step": 165 + }, + { + "epoch": 0.08436566927132964, + "grad_norm": 3.8122544288635254, + "learning_rate": 3.3739837398373986e-06, + "loss": 1.2614, + "step": 166 + }, + { + "epoch": 0.08487389619465091, + "grad_norm": 3.916015863418579, + "learning_rate": 3.394308943089431e-06, + "loss": 1.2777, + "step": 167 + }, + { + "epoch": 0.08538212311797218, + "grad_norm": 3.9047353267669678, + "learning_rate": 3.414634146341464e-06, + "loss": 1.251, + "step": 168 + }, + { + "epoch": 0.08589035004129343, + "grad_norm": 3.993406057357788, + "learning_rate": 3.4349593495934965e-06, + "loss": 1.3075, + "step": 169 + }, + { + "epoch": 0.0863985769646147, + "grad_norm": 3.906684160232544, + "learning_rate": 3.4552845528455287e-06, + "loss": 1.2627, + "step": 170 + }, + { + "epoch": 0.08690680388793597, + "grad_norm": 4.104040622711182, + "learning_rate": 3.475609756097561e-06, + "loss": 1.2762, + "step": 171 + }, + { + "epoch": 0.08741503081125723, + "grad_norm": 3.6508748531341553, + "learning_rate": 3.495934959349594e-06, + "loss": 1.1899, + "step": 172 + }, + { + "epoch": 0.08792325773457849, + "grad_norm": 3.970284938812256, + "learning_rate": 3.516260162601626e-06, + "loss": 1.2013, + "step": 173 + }, + { + "epoch": 0.08843148465789975, + "grad_norm": 3.715240001678467, + "learning_rate": 3.5365853658536588e-06, + "loss": 1.2735, + "step": 174 + }, + { + "epoch": 0.08893971158122102, + "grad_norm": 3.685577392578125, + "learning_rate": 3.5569105691056914e-06, + "loss": 1.21, + "step": 175 + }, + { + "epoch": 0.08944793850454227, + "grad_norm": 3.7775447368621826, + "learning_rate": 3.577235772357724e-06, + "loss": 1.2972, + "step": 176 + }, + { + "epoch": 0.08995616542786354, + "grad_norm": 3.7754499912261963, + "learning_rate": 3.5975609756097562e-06, + "loss": 1.1667, + "step": 177 + }, + { + "epoch": 0.09046439235118481, + "grad_norm": 11.866535186767578, + "learning_rate": 3.6178861788617893e-06, + "loss": 1.5132, + "step": 178 + }, + { + "epoch": 0.09097261927450606, + "grad_norm": 3.855421781539917, + "learning_rate": 3.6382113821138215e-06, + "loss": 1.3445, + "step": 179 + }, + { + "epoch": 0.09148084619782733, + "grad_norm": 4.019442558288574, + "learning_rate": 3.6585365853658537e-06, + "loss": 1.2539, + "step": 180 + }, + { + "epoch": 0.0919890731211486, + "grad_norm": 4.017965316772461, + "learning_rate": 3.6788617886178863e-06, + "loss": 1.2669, + "step": 181 + }, + { + "epoch": 0.09249730004446985, + "grad_norm": 3.872027635574341, + "learning_rate": 3.699186991869919e-06, + "loss": 1.2374, + "step": 182 + }, + { + "epoch": 0.09300552696779112, + "grad_norm": 4.099319934844971, + "learning_rate": 3.7195121951219516e-06, + "loss": 1.3732, + "step": 183 + }, + { + "epoch": 0.09351375389111238, + "grad_norm": 3.8168752193450928, + "learning_rate": 3.7398373983739838e-06, + "loss": 1.3192, + "step": 184 + }, + { + "epoch": 0.09402198081443365, + "grad_norm": 3.548044443130493, + "learning_rate": 3.760162601626017e-06, + "loss": 1.2726, + "step": 185 + }, + { + "epoch": 0.0945302077377549, + "grad_norm": 3.644498109817505, + "learning_rate": 3.780487804878049e-06, + "loss": 1.2598, + "step": 186 + }, + { + "epoch": 0.09503843466107617, + "grad_norm": 4.000254154205322, + "learning_rate": 3.8008130081300817e-06, + "loss": 1.3566, + "step": 187 + }, + { + "epoch": 0.09554666158439744, + "grad_norm": 3.4733471870422363, + "learning_rate": 3.821138211382115e-06, + "loss": 1.1885, + "step": 188 + }, + { + "epoch": 0.09605488850771869, + "grad_norm": 3.7947239875793457, + "learning_rate": 3.8414634146341465e-06, + "loss": 1.3288, + "step": 189 + }, + { + "epoch": 0.09656311543103996, + "grad_norm": 3.94771409034729, + "learning_rate": 3.861788617886179e-06, + "loss": 1.3124, + "step": 190 + }, + { + "epoch": 0.09707134235436123, + "grad_norm": 4.032608509063721, + "learning_rate": 3.882113821138212e-06, + "loss": 1.236, + "step": 191 + }, + { + "epoch": 0.09757956927768248, + "grad_norm": 3.6716253757476807, + "learning_rate": 3.902439024390244e-06, + "loss": 1.2821, + "step": 192 + }, + { + "epoch": 0.09808779620100375, + "grad_norm": 3.8969194889068604, + "learning_rate": 3.922764227642277e-06, + "loss": 1.3023, + "step": 193 + }, + { + "epoch": 0.09859602312432501, + "grad_norm": 4.0722975730896, + "learning_rate": 3.943089430894309e-06, + "loss": 1.3167, + "step": 194 + }, + { + "epoch": 0.09910425004764628, + "grad_norm": 3.9485273361206055, + "learning_rate": 3.963414634146342e-06, + "loss": 1.2637, + "step": 195 + }, + { + "epoch": 0.09961247697096753, + "grad_norm": 3.7706732749938965, + "learning_rate": 3.983739837398374e-06, + "loss": 1.2213, + "step": 196 + }, + { + "epoch": 0.1001207038942888, + "grad_norm": 3.6940486431121826, + "learning_rate": 4.004065040650407e-06, + "loss": 1.2903, + "step": 197 + }, + { + "epoch": 0.10062893081761007, + "grad_norm": 3.6795332431793213, + "learning_rate": 4.024390243902439e-06, + "loss": 1.2003, + "step": 198 + }, + { + "epoch": 0.10113715774093132, + "grad_norm": 3.8393092155456543, + "learning_rate": 4.044715447154472e-06, + "loss": 1.352, + "step": 199 + }, + { + "epoch": 0.10164538466425259, + "grad_norm": 3.8912806510925293, + "learning_rate": 4.0650406504065046e-06, + "loss": 1.2611, + "step": 200 + }, + { + "epoch": 0.10215361158757386, + "grad_norm": 3.9540915489196777, + "learning_rate": 4.085365853658536e-06, + "loss": 1.2613, + "step": 201 + }, + { + "epoch": 0.10266183851089511, + "grad_norm": 3.922166585922241, + "learning_rate": 4.10569105691057e-06, + "loss": 1.3061, + "step": 202 + }, + { + "epoch": 0.10317006543421638, + "grad_norm": 4.365126609802246, + "learning_rate": 4.126016260162602e-06, + "loss": 1.3791, + "step": 203 + }, + { + "epoch": 0.10367829235753764, + "grad_norm": 3.6724672317504883, + "learning_rate": 4.146341463414634e-06, + "loss": 1.1408, + "step": 204 + }, + { + "epoch": 0.1041865192808589, + "grad_norm": 3.7531189918518066, + "learning_rate": 4.166666666666667e-06, + "loss": 1.276, + "step": 205 + }, + { + "epoch": 0.10469474620418016, + "grad_norm": 3.5939886569976807, + "learning_rate": 4.1869918699186995e-06, + "loss": 1.1531, + "step": 206 + }, + { + "epoch": 0.10520297312750143, + "grad_norm": 3.8948142528533936, + "learning_rate": 4.207317073170732e-06, + "loss": 1.2804, + "step": 207 + }, + { + "epoch": 0.1057112000508227, + "grad_norm": 3.7475123405456543, + "learning_rate": 4.227642276422765e-06, + "loss": 1.2897, + "step": 208 + }, + { + "epoch": 0.10621942697414395, + "grad_norm": 4.131088733673096, + "learning_rate": 4.247967479674797e-06, + "loss": 1.2971, + "step": 209 + }, + { + "epoch": 0.10672765389746522, + "grad_norm": 3.6580843925476074, + "learning_rate": 4.268292682926829e-06, + "loss": 1.1813, + "step": 210 + }, + { + "epoch": 0.10723588082078649, + "grad_norm": 12.907022476196289, + "learning_rate": 4.288617886178862e-06, + "loss": 1.434, + "step": 211 + }, + { + "epoch": 0.10774410774410774, + "grad_norm": 4.026226043701172, + "learning_rate": 4.308943089430894e-06, + "loss": 1.3102, + "step": 212 + }, + { + "epoch": 0.108252334667429, + "grad_norm": 3.583810567855835, + "learning_rate": 4.329268292682927e-06, + "loss": 1.2323, + "step": 213 + }, + { + "epoch": 0.10876056159075027, + "grad_norm": 3.931403636932373, + "learning_rate": 4.34959349593496e-06, + "loss": 1.2023, + "step": 214 + }, + { + "epoch": 0.10926878851407153, + "grad_norm": 3.6533145904541016, + "learning_rate": 4.369918699186992e-06, + "loss": 1.1653, + "step": 215 + }, + { + "epoch": 0.1097770154373928, + "grad_norm": 3.740746259689331, + "learning_rate": 4.390243902439025e-06, + "loss": 1.2121, + "step": 216 + }, + { + "epoch": 0.11028524236071406, + "grad_norm": 3.658018112182617, + "learning_rate": 4.410569105691057e-06, + "loss": 1.2733, + "step": 217 + }, + { + "epoch": 0.11079346928403533, + "grad_norm": 3.9621124267578125, + "learning_rate": 4.43089430894309e-06, + "loss": 1.1794, + "step": 218 + }, + { + "epoch": 0.11130169620735658, + "grad_norm": 3.379032850265503, + "learning_rate": 4.451219512195122e-06, + "loss": 1.2119, + "step": 219 + }, + { + "epoch": 0.11180992313067785, + "grad_norm": 3.9364140033721924, + "learning_rate": 4.471544715447155e-06, + "loss": 1.3891, + "step": 220 + }, + { + "epoch": 0.11231815005399912, + "grad_norm": 3.717283248901367, + "learning_rate": 4.491869918699187e-06, + "loss": 1.2106, + "step": 221 + }, + { + "epoch": 0.11282637697732037, + "grad_norm": 4.216766834259033, + "learning_rate": 4.51219512195122e-06, + "loss": 1.3475, + "step": 222 + }, + { + "epoch": 0.11333460390064164, + "grad_norm": 3.6524863243103027, + "learning_rate": 4.5325203252032525e-06, + "loss": 1.3016, + "step": 223 + }, + { + "epoch": 0.1138428308239629, + "grad_norm": 4.263420581817627, + "learning_rate": 4.552845528455285e-06, + "loss": 1.2905, + "step": 224 + }, + { + "epoch": 0.11435105774728416, + "grad_norm": 3.6008975505828857, + "learning_rate": 4.573170731707318e-06, + "loss": 1.2788, + "step": 225 + }, + { + "epoch": 0.11485928467060542, + "grad_norm": 3.713282823562622, + "learning_rate": 4.59349593495935e-06, + "loss": 1.277, + "step": 226 + }, + { + "epoch": 0.11536751159392669, + "grad_norm": 3.635056495666504, + "learning_rate": 4.613821138211382e-06, + "loss": 1.2814, + "step": 227 + }, + { + "epoch": 0.11587573851724794, + "grad_norm": 3.731588840484619, + "learning_rate": 4.634146341463416e-06, + "loss": 1.3636, + "step": 228 + }, + { + "epoch": 0.11638396544056921, + "grad_norm": 4.0097198486328125, + "learning_rate": 4.654471544715447e-06, + "loss": 1.2493, + "step": 229 + }, + { + "epoch": 0.11689219236389048, + "grad_norm": 4.035277843475342, + "learning_rate": 4.67479674796748e-06, + "loss": 1.2638, + "step": 230 + }, + { + "epoch": 0.11740041928721175, + "grad_norm": 3.686882972717285, + "learning_rate": 4.695121951219513e-06, + "loss": 1.2817, + "step": 231 + }, + { + "epoch": 0.117908646210533, + "grad_norm": 3.8758201599121094, + "learning_rate": 4.715447154471545e-06, + "loss": 1.2463, + "step": 232 + }, + { + "epoch": 0.11841687313385427, + "grad_norm": 4.043292045593262, + "learning_rate": 4.735772357723578e-06, + "loss": 1.2911, + "step": 233 + }, + { + "epoch": 0.11892510005717553, + "grad_norm": 3.9729626178741455, + "learning_rate": 4.75609756097561e-06, + "loss": 1.313, + "step": 234 + }, + { + "epoch": 0.11943332698049679, + "grad_norm": 3.574331521987915, + "learning_rate": 4.776422764227643e-06, + "loss": 1.3961, + "step": 235 + }, + { + "epoch": 0.11994155390381805, + "grad_norm": 4.03476619720459, + "learning_rate": 4.796747967479675e-06, + "loss": 1.2868, + "step": 236 + }, + { + "epoch": 0.12044978082713932, + "grad_norm": 3.672788381576538, + "learning_rate": 4.817073170731708e-06, + "loss": 1.3771, + "step": 237 + }, + { + "epoch": 0.12095800775046057, + "grad_norm": 4.011895179748535, + "learning_rate": 4.83739837398374e-06, + "loss": 1.2618, + "step": 238 + }, + { + "epoch": 0.12146623467378184, + "grad_norm": 3.7192506790161133, + "learning_rate": 4.857723577235773e-06, + "loss": 1.3259, + "step": 239 + }, + { + "epoch": 0.12197446159710311, + "grad_norm": 3.3653564453125, + "learning_rate": 4.8780487804878055e-06, + "loss": 1.2904, + "step": 240 + }, + { + "epoch": 0.12248268852042436, + "grad_norm": 3.636655330657959, + "learning_rate": 4.898373983739837e-06, + "loss": 1.3524, + "step": 241 + }, + { + "epoch": 0.12299091544374563, + "grad_norm": 4.0803446769714355, + "learning_rate": 4.918699186991871e-06, + "loss": 1.3442, + "step": 242 + }, + { + "epoch": 0.1234991423670669, + "grad_norm": 3.5182483196258545, + "learning_rate": 4.9390243902439025e-06, + "loss": 1.2444, + "step": 243 + }, + { + "epoch": 0.12400736929038816, + "grad_norm": 3.481665849685669, + "learning_rate": 4.959349593495935e-06, + "loss": 1.181, + "step": 244 + }, + { + "epoch": 0.12451559621370942, + "grad_norm": 3.4673781394958496, + "learning_rate": 4.979674796747968e-06, + "loss": 1.3207, + "step": 245 + }, + { + "epoch": 0.12502382313703067, + "grad_norm": 3.4575881958007812, + "learning_rate": 5e-06, + "loss": 1.3064, + "step": 246 + }, + { + "epoch": 0.12553205006035195, + "grad_norm": 4.137662887573242, + "learning_rate": 5.020325203252033e-06, + "loss": 1.2268, + "step": 247 + }, + { + "epoch": 0.1260402769836732, + "grad_norm": 3.655907392501831, + "learning_rate": 5.040650406504065e-06, + "loss": 1.3024, + "step": 248 + }, + { + "epoch": 0.1265485039069945, + "grad_norm": 8.318976402282715, + "learning_rate": 5.060975609756098e-06, + "loss": 1.3418, + "step": 249 + }, + { + "epoch": 0.12705673083031574, + "grad_norm": 3.5912580490112305, + "learning_rate": 5.081300813008131e-06, + "loss": 1.2041, + "step": 250 + }, + { + "epoch": 0.127564957753637, + "grad_norm": 4.007481575012207, + "learning_rate": 5.101626016260163e-06, + "loss": 1.1676, + "step": 251 + }, + { + "epoch": 0.12807318467695827, + "grad_norm": 3.766157388687134, + "learning_rate": 5.121951219512195e-06, + "loss": 1.3185, + "step": 252 + }, + { + "epoch": 0.12858141160027953, + "grad_norm": 3.528630495071411, + "learning_rate": 5.142276422764229e-06, + "loss": 1.2942, + "step": 253 + }, + { + "epoch": 0.12908963852360078, + "grad_norm": 3.672837257385254, + "learning_rate": 5.162601626016261e-06, + "loss": 1.3008, + "step": 254 + }, + { + "epoch": 0.12959786544692206, + "grad_norm": 3.592590808868408, + "learning_rate": 5.182926829268293e-06, + "loss": 1.3084, + "step": 255 + }, + { + "epoch": 0.13010609237024331, + "grad_norm": 3.557032823562622, + "learning_rate": 5.203252032520326e-06, + "loss": 1.2775, + "step": 256 + }, + { + "epoch": 0.13061431929356457, + "grad_norm": 3.6543917655944824, + "learning_rate": 5.223577235772358e-06, + "loss": 1.3496, + "step": 257 + }, + { + "epoch": 0.13112254621688585, + "grad_norm": 3.6346216201782227, + "learning_rate": 5.243902439024391e-06, + "loss": 1.2644, + "step": 258 + }, + { + "epoch": 0.1316307731402071, + "grad_norm": 3.5259435176849365, + "learning_rate": 5.264227642276423e-06, + "loss": 1.3134, + "step": 259 + }, + { + "epoch": 0.13213900006352836, + "grad_norm": 3.558912515640259, + "learning_rate": 5.2845528455284555e-06, + "loss": 1.1762, + "step": 260 + }, + { + "epoch": 0.13264722698684964, + "grad_norm": 3.6628079414367676, + "learning_rate": 5.304878048780488e-06, + "loss": 1.3849, + "step": 261 + }, + { + "epoch": 0.1331554539101709, + "grad_norm": 3.4435086250305176, + "learning_rate": 5.32520325203252e-06, + "loss": 1.2441, + "step": 262 + }, + { + "epoch": 0.13366368083349214, + "grad_norm": 4.010739803314209, + "learning_rate": 5.345528455284553e-06, + "loss": 1.3847, + "step": 263 + }, + { + "epoch": 0.13417190775681342, + "grad_norm": 3.626926898956299, + "learning_rate": 5.365853658536586e-06, + "loss": 1.2959, + "step": 264 + }, + { + "epoch": 0.13468013468013468, + "grad_norm": 3.5818004608154297, + "learning_rate": 5.386178861788618e-06, + "loss": 1.2967, + "step": 265 + }, + { + "epoch": 0.13518836160345593, + "grad_norm": 3.964972496032715, + "learning_rate": 5.4065040650406504e-06, + "loss": 1.3061, + "step": 266 + }, + { + "epoch": 0.1356965885267772, + "grad_norm": 3.8659842014312744, + "learning_rate": 5.426829268292684e-06, + "loss": 1.3736, + "step": 267 + }, + { + "epoch": 0.13620481545009847, + "grad_norm": 3.6874732971191406, + "learning_rate": 5.447154471544716e-06, + "loss": 1.2194, + "step": 268 + }, + { + "epoch": 0.13671304237341972, + "grad_norm": 3.744476556777954, + "learning_rate": 5.467479674796748e-06, + "loss": 1.2867, + "step": 269 + }, + { + "epoch": 0.137221269296741, + "grad_norm": 3.51850962638855, + "learning_rate": 5.487804878048781e-06, + "loss": 1.2741, + "step": 270 + }, + { + "epoch": 0.13772949622006225, + "grad_norm": 3.6498262882232666, + "learning_rate": 5.508130081300814e-06, + "loss": 1.2259, + "step": 271 + }, + { + "epoch": 0.13823772314338353, + "grad_norm": 3.7769477367401123, + "learning_rate": 5.528455284552846e-06, + "loss": 1.2216, + "step": 272 + }, + { + "epoch": 0.1387459500667048, + "grad_norm": 3.5332465171813965, + "learning_rate": 5.548780487804879e-06, + "loss": 1.211, + "step": 273 + }, + { + "epoch": 0.13925417699002604, + "grad_norm": 3.7396240234375, + "learning_rate": 5.569105691056911e-06, + "loss": 1.3535, + "step": 274 + }, + { + "epoch": 0.13976240391334732, + "grad_norm": 3.5387160778045654, + "learning_rate": 5.589430894308944e-06, + "loss": 1.3375, + "step": 275 + }, + { + "epoch": 0.14027063083666858, + "grad_norm": 3.4825077056884766, + "learning_rate": 5.609756097560977e-06, + "loss": 1.3417, + "step": 276 + }, + { + "epoch": 0.14077885775998983, + "grad_norm": 3.5783963203430176, + "learning_rate": 5.6300813008130085e-06, + "loss": 1.2573, + "step": 277 + }, + { + "epoch": 0.1412870846833111, + "grad_norm": 3.5096850395202637, + "learning_rate": 5.650406504065041e-06, + "loss": 1.2363, + "step": 278 + }, + { + "epoch": 0.14179531160663236, + "grad_norm": 3.574193239212036, + "learning_rate": 5.670731707317073e-06, + "loss": 1.4032, + "step": 279 + }, + { + "epoch": 0.14230353852995362, + "grad_norm": 3.4912261962890625, + "learning_rate": 5.691056910569106e-06, + "loss": 1.2603, + "step": 280 + }, + { + "epoch": 0.1428117654532749, + "grad_norm": 3.5065510272979736, + "learning_rate": 5.711382113821139e-06, + "loss": 1.3125, + "step": 281 + }, + { + "epoch": 0.14331999237659615, + "grad_norm": 3.6454124450683594, + "learning_rate": 5.731707317073171e-06, + "loss": 1.285, + "step": 282 + }, + { + "epoch": 0.1438282192999174, + "grad_norm": 3.704364776611328, + "learning_rate": 5.7520325203252034e-06, + "loss": 1.1501, + "step": 283 + }, + { + "epoch": 0.14433644622323868, + "grad_norm": 3.756485939025879, + "learning_rate": 5.772357723577237e-06, + "loss": 1.3346, + "step": 284 + }, + { + "epoch": 0.14484467314655994, + "grad_norm": 3.815615177154541, + "learning_rate": 5.792682926829269e-06, + "loss": 1.3682, + "step": 285 + }, + { + "epoch": 0.1453529000698812, + "grad_norm": 3.9333648681640625, + "learning_rate": 5.813008130081301e-06, + "loss": 1.2763, + "step": 286 + }, + { + "epoch": 0.14586112699320247, + "grad_norm": 3.455777883529663, + "learning_rate": 5.833333333333334e-06, + "loss": 1.151, + "step": 287 + }, + { + "epoch": 0.14636935391652373, + "grad_norm": 3.815992593765259, + "learning_rate": 5.853658536585366e-06, + "loss": 1.3023, + "step": 288 + }, + { + "epoch": 0.14687758083984498, + "grad_norm": 3.914978504180908, + "learning_rate": 5.873983739837399e-06, + "loss": 1.25, + "step": 289 + }, + { + "epoch": 0.14738580776316626, + "grad_norm": 3.6481759548187256, + "learning_rate": 5.894308943089432e-06, + "loss": 1.2893, + "step": 290 + }, + { + "epoch": 0.1478940346864875, + "grad_norm": 3.5571045875549316, + "learning_rate": 5.914634146341464e-06, + "loss": 1.3232, + "step": 291 + }, + { + "epoch": 0.14840226160980877, + "grad_norm": 3.597348690032959, + "learning_rate": 5.934959349593496e-06, + "loss": 1.192, + "step": 292 + }, + { + "epoch": 0.14891048853313005, + "grad_norm": 3.44991397857666, + "learning_rate": 5.95528455284553e-06, + "loss": 1.1843, + "step": 293 + }, + { + "epoch": 0.1494187154564513, + "grad_norm": 3.8357386589050293, + "learning_rate": 5.9756097560975615e-06, + "loss": 1.2407, + "step": 294 + }, + { + "epoch": 0.14992694237977258, + "grad_norm": 3.804199457168579, + "learning_rate": 5.995934959349594e-06, + "loss": 1.2215, + "step": 295 + }, + { + "epoch": 0.15043516930309384, + "grad_norm": 3.6634774208068848, + "learning_rate": 6.016260162601627e-06, + "loss": 1.347, + "step": 296 + }, + { + "epoch": 0.1509433962264151, + "grad_norm": 3.491067886352539, + "learning_rate": 6.0365853658536585e-06, + "loss": 1.2255, + "step": 297 + }, + { + "epoch": 0.15145162314973637, + "grad_norm": 3.578895330429077, + "learning_rate": 6.056910569105692e-06, + "loss": 1.2312, + "step": 298 + }, + { + "epoch": 0.15195985007305762, + "grad_norm": 3.9656708240509033, + "learning_rate": 6.077235772357724e-06, + "loss": 1.3773, + "step": 299 + }, + { + "epoch": 0.15246807699637888, + "grad_norm": 3.67789888381958, + "learning_rate": 6.0975609756097564e-06, + "loss": 1.3023, + "step": 300 + }, + { + "epoch": 0.15297630391970016, + "grad_norm": 3.6001689434051514, + "learning_rate": 6.117886178861789e-06, + "loss": 1.2729, + "step": 301 + }, + { + "epoch": 0.1534845308430214, + "grad_norm": 3.572338581085205, + "learning_rate": 6.138211382113821e-06, + "loss": 1.3521, + "step": 302 + }, + { + "epoch": 0.15399275776634266, + "grad_norm": 3.7971441745758057, + "learning_rate": 6.158536585365854e-06, + "loss": 1.2599, + "step": 303 + }, + { + "epoch": 0.15450098468966394, + "grad_norm": 4.001463413238525, + "learning_rate": 6.178861788617887e-06, + "loss": 1.344, + "step": 304 + }, + { + "epoch": 0.1550092116129852, + "grad_norm": 3.4792215824127197, + "learning_rate": 6.199186991869919e-06, + "loss": 1.2284, + "step": 305 + }, + { + "epoch": 0.15551743853630645, + "grad_norm": 3.7361996173858643, + "learning_rate": 6.219512195121951e-06, + "loss": 1.2382, + "step": 306 + }, + { + "epoch": 0.15602566545962773, + "grad_norm": 3.6837079524993896, + "learning_rate": 6.239837398373985e-06, + "loss": 1.3571, + "step": 307 + }, + { + "epoch": 0.15653389238294899, + "grad_norm": 3.793705463409424, + "learning_rate": 6.260162601626017e-06, + "loss": 1.3289, + "step": 308 + }, + { + "epoch": 0.15704211930627024, + "grad_norm": 3.567331075668335, + "learning_rate": 6.280487804878049e-06, + "loss": 1.3228, + "step": 309 + }, + { + "epoch": 0.15755034622959152, + "grad_norm": 3.763274669647217, + "learning_rate": 6.300813008130082e-06, + "loss": 1.3429, + "step": 310 + }, + { + "epoch": 0.15805857315291277, + "grad_norm": 3.717379093170166, + "learning_rate": 6.321138211382114e-06, + "loss": 1.3641, + "step": 311 + }, + { + "epoch": 0.15856680007623403, + "grad_norm": 3.8312816619873047, + "learning_rate": 6.341463414634147e-06, + "loss": 1.3155, + "step": 312 + }, + { + "epoch": 0.1590750269995553, + "grad_norm": 3.651553153991699, + "learning_rate": 6.36178861788618e-06, + "loss": 1.2838, + "step": 313 + }, + { + "epoch": 0.15958325392287656, + "grad_norm": 3.682612895965576, + "learning_rate": 6.3821138211382115e-06, + "loss": 1.3848, + "step": 314 + }, + { + "epoch": 0.16009148084619781, + "grad_norm": 3.6725523471832275, + "learning_rate": 6.402439024390244e-06, + "loss": 1.2029, + "step": 315 + }, + { + "epoch": 0.1605997077695191, + "grad_norm": 3.7922701835632324, + "learning_rate": 6.422764227642278e-06, + "loss": 1.3111, + "step": 316 + }, + { + "epoch": 0.16110793469284035, + "grad_norm": 3.7131593227386475, + "learning_rate": 6.4430894308943094e-06, + "loss": 1.32, + "step": 317 + }, + { + "epoch": 0.16161616161616163, + "grad_norm": 3.859788656234741, + "learning_rate": 6.463414634146342e-06, + "loss": 1.3625, + "step": 318 + }, + { + "epoch": 0.16212438853948288, + "grad_norm": 3.674773693084717, + "learning_rate": 6.483739837398374e-06, + "loss": 1.2244, + "step": 319 + }, + { + "epoch": 0.16263261546280414, + "grad_norm": 3.4736006259918213, + "learning_rate": 6.504065040650407e-06, + "loss": 1.2257, + "step": 320 + }, + { + "epoch": 0.16314084238612542, + "grad_norm": 3.9480464458465576, + "learning_rate": 6.52439024390244e-06, + "loss": 1.4528, + "step": 321 + }, + { + "epoch": 0.16364906930944667, + "grad_norm": 3.6919679641723633, + "learning_rate": 6.544715447154472e-06, + "loss": 1.2453, + "step": 322 + }, + { + "epoch": 0.16415729623276792, + "grad_norm": 3.6807546615600586, + "learning_rate": 6.565040650406504e-06, + "loss": 1.2104, + "step": 323 + }, + { + "epoch": 0.1646655231560892, + "grad_norm": 3.67043137550354, + "learning_rate": 6.585365853658538e-06, + "loss": 1.3452, + "step": 324 + }, + { + "epoch": 0.16517375007941046, + "grad_norm": 3.3604013919830322, + "learning_rate": 6.60569105691057e-06, + "loss": 1.2311, + "step": 325 + }, + { + "epoch": 0.1656819770027317, + "grad_norm": 3.487772226333618, + "learning_rate": 6.626016260162602e-06, + "loss": 1.2692, + "step": 326 + }, + { + "epoch": 0.166190203926053, + "grad_norm": 3.803863286972046, + "learning_rate": 6.646341463414635e-06, + "loss": 1.4371, + "step": 327 + }, + { + "epoch": 0.16669843084937425, + "grad_norm": 3.3784923553466797, + "learning_rate": 6.666666666666667e-06, + "loss": 1.2383, + "step": 328 + }, + { + "epoch": 0.1672066577726955, + "grad_norm": 3.524672746658325, + "learning_rate": 6.6869918699187e-06, + "loss": 1.2487, + "step": 329 + }, + { + "epoch": 0.16771488469601678, + "grad_norm": 3.207425832748413, + "learning_rate": 6.707317073170733e-06, + "loss": 1.2083, + "step": 330 + }, + { + "epoch": 0.16822311161933803, + "grad_norm": 3.3784162998199463, + "learning_rate": 6.7276422764227645e-06, + "loss": 1.2829, + "step": 331 + }, + { + "epoch": 0.1687313385426593, + "grad_norm": 4.187244415283203, + "learning_rate": 6.747967479674797e-06, + "loss": 1.3114, + "step": 332 + }, + { + "epoch": 0.16923956546598057, + "grad_norm": 3.5479447841644287, + "learning_rate": 6.768292682926831e-06, + "loss": 1.2949, + "step": 333 + }, + { + "epoch": 0.16974779238930182, + "grad_norm": 3.4103052616119385, + "learning_rate": 6.788617886178862e-06, + "loss": 1.1889, + "step": 334 + }, + { + "epoch": 0.17025601931262307, + "grad_norm": 3.217073678970337, + "learning_rate": 6.808943089430895e-06, + "loss": 1.3049, + "step": 335 + }, + { + "epoch": 0.17076424623594436, + "grad_norm": 3.2264113426208496, + "learning_rate": 6.829268292682928e-06, + "loss": 1.1391, + "step": 336 + }, + { + "epoch": 0.1712724731592656, + "grad_norm": 3.488623857498169, + "learning_rate": 6.8495934959349595e-06, + "loss": 1.17, + "step": 337 + }, + { + "epoch": 0.17178070008258686, + "grad_norm": 3.76481556892395, + "learning_rate": 6.869918699186993e-06, + "loss": 1.3463, + "step": 338 + }, + { + "epoch": 0.17228892700590814, + "grad_norm": 3.5634756088256836, + "learning_rate": 6.890243902439025e-06, + "loss": 1.2973, + "step": 339 + }, + { + "epoch": 0.1727971539292294, + "grad_norm": 3.3373970985412598, + "learning_rate": 6.910569105691057e-06, + "loss": 1.2365, + "step": 340 + }, + { + "epoch": 0.17330538085255065, + "grad_norm": 3.5796754360198975, + "learning_rate": 6.93089430894309e-06, + "loss": 1.405, + "step": 341 + }, + { + "epoch": 0.17381360777587193, + "grad_norm": 3.383561849594116, + "learning_rate": 6.951219512195122e-06, + "loss": 1.1957, + "step": 342 + }, + { + "epoch": 0.17432183469919318, + "grad_norm": 3.610441207885742, + "learning_rate": 6.971544715447155e-06, + "loss": 1.2192, + "step": 343 + }, + { + "epoch": 0.17483006162251447, + "grad_norm": 3.319985866546631, + "learning_rate": 6.991869918699188e-06, + "loss": 1.2916, + "step": 344 + }, + { + "epoch": 0.17533828854583572, + "grad_norm": 3.5332345962524414, + "learning_rate": 7.01219512195122e-06, + "loss": 1.2721, + "step": 345 + }, + { + "epoch": 0.17584651546915697, + "grad_norm": 3.552676200866699, + "learning_rate": 7.032520325203252e-06, + "loss": 1.3467, + "step": 346 + }, + { + "epoch": 0.17635474239247825, + "grad_norm": 3.745915412902832, + "learning_rate": 7.052845528455286e-06, + "loss": 1.3653, + "step": 347 + }, + { + "epoch": 0.1768629693157995, + "grad_norm": 3.4070985317230225, + "learning_rate": 7.0731707317073175e-06, + "loss": 1.3137, + "step": 348 + }, + { + "epoch": 0.17737119623912076, + "grad_norm": 3.583345890045166, + "learning_rate": 7.09349593495935e-06, + "loss": 1.2447, + "step": 349 + }, + { + "epoch": 0.17787942316244204, + "grad_norm": 3.593552350997925, + "learning_rate": 7.113821138211383e-06, + "loss": 1.2614, + "step": 350 + }, + { + "epoch": 0.1783876500857633, + "grad_norm": 3.6274521350860596, + "learning_rate": 7.1341463414634146e-06, + "loss": 1.2424, + "step": 351 + }, + { + "epoch": 0.17889587700908455, + "grad_norm": 3.4343936443328857, + "learning_rate": 7.154471544715448e-06, + "loss": 1.0972, + "step": 352 + }, + { + "epoch": 0.17940410393240583, + "grad_norm": 3.4829659461975098, + "learning_rate": 7.174796747967481e-06, + "loss": 1.3234, + "step": 353 + }, + { + "epoch": 0.17991233085572708, + "grad_norm": 3.9330294132232666, + "learning_rate": 7.1951219512195125e-06, + "loss": 1.2978, + "step": 354 + }, + { + "epoch": 0.18042055777904834, + "grad_norm": 3.7791481018066406, + "learning_rate": 7.215447154471545e-06, + "loss": 1.3102, + "step": 355 + }, + { + "epoch": 0.18092878470236962, + "grad_norm": 3.5597262382507324, + "learning_rate": 7.2357723577235786e-06, + "loss": 1.3284, + "step": 356 + }, + { + "epoch": 0.18143701162569087, + "grad_norm": 3.4017419815063477, + "learning_rate": 7.25609756097561e-06, + "loss": 1.2043, + "step": 357 + }, + { + "epoch": 0.18194523854901212, + "grad_norm": 3.3661866188049316, + "learning_rate": 7.276422764227643e-06, + "loss": 1.2812, + "step": 358 + }, + { + "epoch": 0.1824534654723334, + "grad_norm": 3.6549904346466064, + "learning_rate": 7.296747967479675e-06, + "loss": 1.2439, + "step": 359 + }, + { + "epoch": 0.18296169239565466, + "grad_norm": 3.5217676162719727, + "learning_rate": 7.317073170731707e-06, + "loss": 1.2781, + "step": 360 + }, + { + "epoch": 0.1834699193189759, + "grad_norm": 4.081654071807861, + "learning_rate": 7.337398373983741e-06, + "loss": 1.2801, + "step": 361 + }, + { + "epoch": 0.1839781462422972, + "grad_norm": 4.09951114654541, + "learning_rate": 7.357723577235773e-06, + "loss": 1.3082, + "step": 362 + }, + { + "epoch": 0.18448637316561844, + "grad_norm": 3.354565382003784, + "learning_rate": 7.378048780487805e-06, + "loss": 1.2412, + "step": 363 + }, + { + "epoch": 0.1849946000889397, + "grad_norm": 3.285402297973633, + "learning_rate": 7.398373983739838e-06, + "loss": 1.1878, + "step": 364 + }, + { + "epoch": 0.18550282701226098, + "grad_norm": 4.071623802185059, + "learning_rate": 7.41869918699187e-06, + "loss": 1.4499, + "step": 365 + }, + { + "epoch": 0.18601105393558223, + "grad_norm": 3.3457748889923096, + "learning_rate": 7.439024390243903e-06, + "loss": 1.3129, + "step": 366 + }, + { + "epoch": 0.1865192808589035, + "grad_norm": 3.6435835361480713, + "learning_rate": 7.459349593495936e-06, + "loss": 1.2058, + "step": 367 + }, + { + "epoch": 0.18702750778222477, + "grad_norm": 3.8403193950653076, + "learning_rate": 7.4796747967479676e-06, + "loss": 1.3017, + "step": 368 + }, + { + "epoch": 0.18753573470554602, + "grad_norm": 3.588543653488159, + "learning_rate": 7.500000000000001e-06, + "loss": 1.2786, + "step": 369 + }, + { + "epoch": 0.1880439616288673, + "grad_norm": 3.3542251586914062, + "learning_rate": 7.520325203252034e-06, + "loss": 1.28, + "step": 370 + }, + { + "epoch": 0.18855218855218855, + "grad_norm": 3.4125912189483643, + "learning_rate": 7.5406504065040654e-06, + "loss": 1.2436, + "step": 371 + }, + { + "epoch": 0.1890604154755098, + "grad_norm": 3.2614572048187256, + "learning_rate": 7.560975609756098e-06, + "loss": 1.2692, + "step": 372 + }, + { + "epoch": 0.1895686423988311, + "grad_norm": 3.295055866241455, + "learning_rate": 7.5813008130081316e-06, + "loss": 1.2411, + "step": 373 + }, + { + "epoch": 0.19007686932215234, + "grad_norm": 3.7534825801849365, + "learning_rate": 7.601626016260163e-06, + "loss": 1.2341, + "step": 374 + }, + { + "epoch": 0.1905850962454736, + "grad_norm": 3.991771936416626, + "learning_rate": 7.621951219512196e-06, + "loss": 1.2379, + "step": 375 + }, + { + "epoch": 0.19109332316879488, + "grad_norm": 3.7469890117645264, + "learning_rate": 7.64227642276423e-06, + "loss": 1.3563, + "step": 376 + }, + { + "epoch": 0.19160155009211613, + "grad_norm": 3.7260825634002686, + "learning_rate": 7.66260162601626e-06, + "loss": 1.2481, + "step": 377 + }, + { + "epoch": 0.19210977701543738, + "grad_norm": 3.3605759143829346, + "learning_rate": 7.682926829268293e-06, + "loss": 1.2917, + "step": 378 + }, + { + "epoch": 0.19261800393875866, + "grad_norm": 4.850787162780762, + "learning_rate": 7.703252032520326e-06, + "loss": 1.4126, + "step": 379 + }, + { + "epoch": 0.19312623086207992, + "grad_norm": 3.4996542930603027, + "learning_rate": 7.723577235772358e-06, + "loss": 1.4338, + "step": 380 + }, + { + "epoch": 0.19363445778540117, + "grad_norm": 3.6611642837524414, + "learning_rate": 7.743902439024391e-06, + "loss": 1.3108, + "step": 381 + }, + { + "epoch": 0.19414268470872245, + "grad_norm": 3.5380356311798096, + "learning_rate": 7.764227642276424e-06, + "loss": 1.3453, + "step": 382 + }, + { + "epoch": 0.1946509116320437, + "grad_norm": 3.764770984649658, + "learning_rate": 7.784552845528456e-06, + "loss": 1.2773, + "step": 383 + }, + { + "epoch": 0.19515913855536496, + "grad_norm": 3.463135004043579, + "learning_rate": 7.804878048780489e-06, + "loss": 1.314, + "step": 384 + }, + { + "epoch": 0.19566736547868624, + "grad_norm": 3.4924633502960205, + "learning_rate": 7.82520325203252e-06, + "loss": 1.3208, + "step": 385 + }, + { + "epoch": 0.1961755924020075, + "grad_norm": 3.3984928131103516, + "learning_rate": 7.845528455284554e-06, + "loss": 1.2752, + "step": 386 + }, + { + "epoch": 0.19668381932532875, + "grad_norm": 3.5272583961486816, + "learning_rate": 7.865853658536587e-06, + "loss": 1.2225, + "step": 387 + }, + { + "epoch": 0.19719204624865003, + "grad_norm": 3.674283027648926, + "learning_rate": 7.886178861788618e-06, + "loss": 1.2883, + "step": 388 + }, + { + "epoch": 0.19770027317197128, + "grad_norm": 3.394155263900757, + "learning_rate": 7.90650406504065e-06, + "loss": 1.3093, + "step": 389 + }, + { + "epoch": 0.19820850009529256, + "grad_norm": 3.619893789291382, + "learning_rate": 7.926829268292685e-06, + "loss": 1.2639, + "step": 390 + }, + { + "epoch": 0.19871672701861381, + "grad_norm": 3.583444833755493, + "learning_rate": 7.947154471544715e-06, + "loss": 1.2722, + "step": 391 + }, + { + "epoch": 0.19922495394193507, + "grad_norm": 3.5035605430603027, + "learning_rate": 7.967479674796748e-06, + "loss": 1.3141, + "step": 392 + }, + { + "epoch": 0.19973318086525635, + "grad_norm": 3.4563138484954834, + "learning_rate": 7.98780487804878e-06, + "loss": 1.3688, + "step": 393 + }, + { + "epoch": 0.2002414077885776, + "grad_norm": 3.50997257232666, + "learning_rate": 8.008130081300813e-06, + "loss": 1.2373, + "step": 394 + }, + { + "epoch": 0.20074963471189886, + "grad_norm": 3.5368010997772217, + "learning_rate": 8.028455284552846e-06, + "loss": 1.3064, + "step": 395 + }, + { + "epoch": 0.20125786163522014, + "grad_norm": 3.5220799446105957, + "learning_rate": 8.048780487804879e-06, + "loss": 1.2372, + "step": 396 + }, + { + "epoch": 0.2017660885585414, + "grad_norm": 3.81137752532959, + "learning_rate": 8.069105691056911e-06, + "loss": 1.5465, + "step": 397 + }, + { + "epoch": 0.20227431548186264, + "grad_norm": 3.8925790786743164, + "learning_rate": 8.089430894308944e-06, + "loss": 1.3473, + "step": 398 + }, + { + "epoch": 0.20278254240518392, + "grad_norm": 3.4865732192993164, + "learning_rate": 8.109756097560977e-06, + "loss": 1.2192, + "step": 399 + }, + { + "epoch": 0.20329076932850518, + "grad_norm": 3.5314934253692627, + "learning_rate": 8.130081300813009e-06, + "loss": 1.3106, + "step": 400 + }, + { + "epoch": 0.20379899625182643, + "grad_norm": 11.417930603027344, + "learning_rate": 8.150406504065042e-06, + "loss": 1.4589, + "step": 401 + }, + { + "epoch": 0.2043072231751477, + "grad_norm": 3.5613293647766113, + "learning_rate": 8.170731707317073e-06, + "loss": 1.3619, + "step": 402 + }, + { + "epoch": 0.20481545009846897, + "grad_norm": 5.17199182510376, + "learning_rate": 8.191056910569107e-06, + "loss": 1.341, + "step": 403 + }, + { + "epoch": 0.20532367702179022, + "grad_norm": 4.516615390777588, + "learning_rate": 8.21138211382114e-06, + "loss": 1.3727, + "step": 404 + }, + { + "epoch": 0.2058319039451115, + "grad_norm": 3.745323896408081, + "learning_rate": 8.23170731707317e-06, + "loss": 1.2878, + "step": 405 + }, + { + "epoch": 0.20634013086843275, + "grad_norm": 3.2874369621276855, + "learning_rate": 8.252032520325203e-06, + "loss": 1.172, + "step": 406 + }, + { + "epoch": 0.206848357791754, + "grad_norm": 3.345372438430786, + "learning_rate": 8.272357723577238e-06, + "loss": 1.3093, + "step": 407 + }, + { + "epoch": 0.2073565847150753, + "grad_norm": 3.8618834018707275, + "learning_rate": 8.292682926829268e-06, + "loss": 1.2398, + "step": 408 + }, + { + "epoch": 0.20786481163839654, + "grad_norm": 3.3758747577667236, + "learning_rate": 8.313008130081301e-06, + "loss": 1.3063, + "step": 409 + }, + { + "epoch": 0.2083730385617178, + "grad_norm": 3.501466751098633, + "learning_rate": 8.333333333333334e-06, + "loss": 1.3748, + "step": 410 + }, + { + "epoch": 0.20888126548503907, + "grad_norm": 3.5670862197875977, + "learning_rate": 8.353658536585366e-06, + "loss": 1.3696, + "step": 411 + }, + { + "epoch": 0.20938949240836033, + "grad_norm": 3.628492593765259, + "learning_rate": 8.373983739837399e-06, + "loss": 1.2935, + "step": 412 + }, + { + "epoch": 0.2098977193316816, + "grad_norm": 3.188523769378662, + "learning_rate": 8.394308943089432e-06, + "loss": 1.2003, + "step": 413 + }, + { + "epoch": 0.21040594625500286, + "grad_norm": 3.282963991165161, + "learning_rate": 8.414634146341464e-06, + "loss": 1.2503, + "step": 414 + }, + { + "epoch": 0.21091417317832412, + "grad_norm": 3.601407527923584, + "learning_rate": 8.434959349593497e-06, + "loss": 1.2435, + "step": 415 + }, + { + "epoch": 0.2114224001016454, + "grad_norm": 4.200768947601318, + "learning_rate": 8.45528455284553e-06, + "loss": 1.3499, + "step": 416 + }, + { + "epoch": 0.21193062702496665, + "grad_norm": 3.487779378890991, + "learning_rate": 8.475609756097562e-06, + "loss": 1.2928, + "step": 417 + }, + { + "epoch": 0.2124388539482879, + "grad_norm": 3.47430157661438, + "learning_rate": 8.495934959349595e-06, + "loss": 1.3469, + "step": 418 + }, + { + "epoch": 0.21294708087160918, + "grad_norm": 3.8267080783843994, + "learning_rate": 8.516260162601627e-06, + "loss": 1.3764, + "step": 419 + }, + { + "epoch": 0.21345530779493044, + "grad_norm": 3.6177916526794434, + "learning_rate": 8.536585365853658e-06, + "loss": 1.4348, + "step": 420 + }, + { + "epoch": 0.2139635347182517, + "grad_norm": 3.4687182903289795, + "learning_rate": 8.556910569105693e-06, + "loss": 1.319, + "step": 421 + }, + { + "epoch": 0.21447176164157297, + "grad_norm": 3.39560866355896, + "learning_rate": 8.577235772357724e-06, + "loss": 1.3131, + "step": 422 + }, + { + "epoch": 0.21497998856489423, + "grad_norm": 3.492347240447998, + "learning_rate": 8.597560975609756e-06, + "loss": 1.3446, + "step": 423 + }, + { + "epoch": 0.21548821548821548, + "grad_norm": 3.751417636871338, + "learning_rate": 8.617886178861789e-06, + "loss": 1.3222, + "step": 424 + }, + { + "epoch": 0.21599644241153676, + "grad_norm": 3.345554828643799, + "learning_rate": 8.638211382113821e-06, + "loss": 1.2489, + "step": 425 + }, + { + "epoch": 0.216504669334858, + "grad_norm": 3.6721158027648926, + "learning_rate": 8.658536585365854e-06, + "loss": 1.2827, + "step": 426 + }, + { + "epoch": 0.21701289625817927, + "grad_norm": 3.5361924171447754, + "learning_rate": 8.678861788617887e-06, + "loss": 1.3585, + "step": 427 + }, + { + "epoch": 0.21752112318150055, + "grad_norm": 3.324645757675171, + "learning_rate": 8.69918699186992e-06, + "loss": 1.3114, + "step": 428 + }, + { + "epoch": 0.2180293501048218, + "grad_norm": 3.320855140686035, + "learning_rate": 8.719512195121952e-06, + "loss": 1.2281, + "step": 429 + }, + { + "epoch": 0.21853757702814305, + "grad_norm": 3.440333127975464, + "learning_rate": 8.739837398373985e-06, + "loss": 1.4005, + "step": 430 + }, + { + "epoch": 0.21904580395146434, + "grad_norm": 3.48341965675354, + "learning_rate": 8.760162601626017e-06, + "loss": 1.363, + "step": 431 + }, + { + "epoch": 0.2195540308747856, + "grad_norm": 3.2691972255706787, + "learning_rate": 8.78048780487805e-06, + "loss": 1.2695, + "step": 432 + }, + { + "epoch": 0.22006225779810684, + "grad_norm": 4.021475791931152, + "learning_rate": 8.800813008130082e-06, + "loss": 1.4454, + "step": 433 + }, + { + "epoch": 0.22057048472142812, + "grad_norm": 3.26725697517395, + "learning_rate": 8.821138211382113e-06, + "loss": 1.3682, + "step": 434 + }, + { + "epoch": 0.22107871164474938, + "grad_norm": 3.592050790786743, + "learning_rate": 8.841463414634148e-06, + "loss": 1.3953, + "step": 435 + }, + { + "epoch": 0.22158693856807066, + "grad_norm": 3.366631031036377, + "learning_rate": 8.86178861788618e-06, + "loss": 1.29, + "step": 436 + }, + { + "epoch": 0.2220951654913919, + "grad_norm": 3.5437285900115967, + "learning_rate": 8.882113821138211e-06, + "loss": 1.2646, + "step": 437 + }, + { + "epoch": 0.22260339241471316, + "grad_norm": 3.404071569442749, + "learning_rate": 8.902439024390244e-06, + "loss": 1.2194, + "step": 438 + }, + { + "epoch": 0.22311161933803444, + "grad_norm": 3.740020275115967, + "learning_rate": 8.922764227642278e-06, + "loss": 1.1974, + "step": 439 + }, + { + "epoch": 0.2236198462613557, + "grad_norm": 3.812560558319092, + "learning_rate": 8.94308943089431e-06, + "loss": 1.2404, + "step": 440 + }, + { + "epoch": 0.22412807318467695, + "grad_norm": 3.365743637084961, + "learning_rate": 8.963414634146342e-06, + "loss": 1.3007, + "step": 441 + }, + { + "epoch": 0.22463630010799823, + "grad_norm": 3.463697671890259, + "learning_rate": 8.983739837398374e-06, + "loss": 1.2529, + "step": 442 + }, + { + "epoch": 0.22514452703131949, + "grad_norm": 3.325098991394043, + "learning_rate": 9.004065040650407e-06, + "loss": 1.2782, + "step": 443 + }, + { + "epoch": 0.22565275395464074, + "grad_norm": 3.305267810821533, + "learning_rate": 9.02439024390244e-06, + "loss": 1.3544, + "step": 444 + }, + { + "epoch": 0.22616098087796202, + "grad_norm": 3.480679750442505, + "learning_rate": 9.044715447154472e-06, + "loss": 1.3709, + "step": 445 + }, + { + "epoch": 0.22666920780128327, + "grad_norm": 3.7187793254852295, + "learning_rate": 9.065040650406505e-06, + "loss": 1.2159, + "step": 446 + }, + { + "epoch": 0.22717743472460453, + "grad_norm": 3.6196069717407227, + "learning_rate": 9.085365853658538e-06, + "loss": 1.312, + "step": 447 + }, + { + "epoch": 0.2276856616479258, + "grad_norm": 3.43747878074646, + "learning_rate": 9.10569105691057e-06, + "loss": 1.2508, + "step": 448 + }, + { + "epoch": 0.22819388857124706, + "grad_norm": 3.117326021194458, + "learning_rate": 9.126016260162603e-06, + "loss": 1.2848, + "step": 449 + }, + { + "epoch": 0.22870211549456831, + "grad_norm": 3.348893642425537, + "learning_rate": 9.146341463414635e-06, + "loss": 1.2183, + "step": 450 + }, + { + "epoch": 0.2292103424178896, + "grad_norm": 3.716628074645996, + "learning_rate": 9.166666666666666e-06, + "loss": 1.4024, + "step": 451 + }, + { + "epoch": 0.22971856934121085, + "grad_norm": 3.6212241649627686, + "learning_rate": 9.1869918699187e-06, + "loss": 1.3003, + "step": 452 + }, + { + "epoch": 0.2302267962645321, + "grad_norm": 3.806009292602539, + "learning_rate": 9.207317073170733e-06, + "loss": 1.3927, + "step": 453 + }, + { + "epoch": 0.23073502318785338, + "grad_norm": 3.6030616760253906, + "learning_rate": 9.227642276422764e-06, + "loss": 1.2962, + "step": 454 + }, + { + "epoch": 0.23124325011117464, + "grad_norm": 3.7318930625915527, + "learning_rate": 9.247967479674797e-06, + "loss": 1.2296, + "step": 455 + }, + { + "epoch": 0.2317514770344959, + "grad_norm": 3.260894775390625, + "learning_rate": 9.268292682926831e-06, + "loss": 1.3221, + "step": 456 + }, + { + "epoch": 0.23225970395781717, + "grad_norm": 3.47714900970459, + "learning_rate": 9.288617886178862e-06, + "loss": 1.1855, + "step": 457 + }, + { + "epoch": 0.23276793088113842, + "grad_norm": 4.364900588989258, + "learning_rate": 9.308943089430895e-06, + "loss": 1.3621, + "step": 458 + }, + { + "epoch": 0.2332761578044597, + "grad_norm": 3.5738487243652344, + "learning_rate": 9.329268292682927e-06, + "loss": 1.3473, + "step": 459 + }, + { + "epoch": 0.23378438472778096, + "grad_norm": 4.652425289154053, + "learning_rate": 9.34959349593496e-06, + "loss": 1.3563, + "step": 460 + }, + { + "epoch": 0.2342926116511022, + "grad_norm": 7.233104705810547, + "learning_rate": 9.369918699186993e-06, + "loss": 1.4006, + "step": 461 + }, + { + "epoch": 0.2348008385744235, + "grad_norm": 3.273244857788086, + "learning_rate": 9.390243902439025e-06, + "loss": 1.3137, + "step": 462 + }, + { + "epoch": 0.23530906549774475, + "grad_norm": 3.6843795776367188, + "learning_rate": 9.410569105691058e-06, + "loss": 1.3714, + "step": 463 + }, + { + "epoch": 0.235817292421066, + "grad_norm": 3.619368553161621, + "learning_rate": 9.43089430894309e-06, + "loss": 1.282, + "step": 464 + }, + { + "epoch": 0.23632551934438728, + "grad_norm": 3.4482295513153076, + "learning_rate": 9.451219512195122e-06, + "loss": 1.2551, + "step": 465 + }, + { + "epoch": 0.23683374626770853, + "grad_norm": 3.2826528549194336, + "learning_rate": 9.471544715447156e-06, + "loss": 1.2826, + "step": 466 + }, + { + "epoch": 0.2373419731910298, + "grad_norm": 3.5899658203125, + "learning_rate": 9.491869918699188e-06, + "loss": 1.3268, + "step": 467 + }, + { + "epoch": 0.23785020011435107, + "grad_norm": 3.3438339233398438, + "learning_rate": 9.51219512195122e-06, + "loss": 1.3673, + "step": 468 + }, + { + "epoch": 0.23835842703767232, + "grad_norm": 3.659921407699585, + "learning_rate": 9.532520325203252e-06, + "loss": 1.2785, + "step": 469 + }, + { + "epoch": 0.23886665396099357, + "grad_norm": 3.542293071746826, + "learning_rate": 9.552845528455286e-06, + "loss": 1.2533, + "step": 470 + }, + { + "epoch": 0.23937488088431486, + "grad_norm": 3.669058084487915, + "learning_rate": 9.573170731707317e-06, + "loss": 1.1636, + "step": 471 + }, + { + "epoch": 0.2398831078076361, + "grad_norm": 3.8697493076324463, + "learning_rate": 9.59349593495935e-06, + "loss": 1.3559, + "step": 472 + }, + { + "epoch": 0.24039133473095736, + "grad_norm": 3.661998987197876, + "learning_rate": 9.613821138211383e-06, + "loss": 1.3293, + "step": 473 + }, + { + "epoch": 0.24089956165427864, + "grad_norm": 3.7692317962646484, + "learning_rate": 9.634146341463415e-06, + "loss": 1.2875, + "step": 474 + }, + { + "epoch": 0.2414077885775999, + "grad_norm": 3.5682339668273926, + "learning_rate": 9.654471544715448e-06, + "loss": 1.3229, + "step": 475 + }, + { + "epoch": 0.24191601550092115, + "grad_norm": 3.4052696228027344, + "learning_rate": 9.67479674796748e-06, + "loss": 1.3713, + "step": 476 + }, + { + "epoch": 0.24242424242424243, + "grad_norm": 3.3954174518585205, + "learning_rate": 9.695121951219513e-06, + "loss": 1.2427, + "step": 477 + }, + { + "epoch": 0.24293246934756368, + "grad_norm": 3.2011301517486572, + "learning_rate": 9.715447154471546e-06, + "loss": 1.2075, + "step": 478 + }, + { + "epoch": 0.24344069627088494, + "grad_norm": 3.5140979290008545, + "learning_rate": 9.735772357723578e-06, + "loss": 1.4365, + "step": 479 + }, + { + "epoch": 0.24394892319420622, + "grad_norm": 3.40429425239563, + "learning_rate": 9.756097560975611e-06, + "loss": 1.1789, + "step": 480 + }, + { + "epoch": 0.24445715011752747, + "grad_norm": 3.4835615158081055, + "learning_rate": 9.776422764227644e-06, + "loss": 1.2674, + "step": 481 + }, + { + "epoch": 0.24496537704084873, + "grad_norm": 3.3621158599853516, + "learning_rate": 9.796747967479675e-06, + "loss": 1.2595, + "step": 482 + }, + { + "epoch": 0.24547360396417, + "grad_norm": 3.61655855178833, + "learning_rate": 9.817073170731707e-06, + "loss": 1.2872, + "step": 483 + }, + { + "epoch": 0.24598183088749126, + "grad_norm": 3.48075795173645, + "learning_rate": 9.837398373983741e-06, + "loss": 1.3344, + "step": 484 + }, + { + "epoch": 0.24649005781081254, + "grad_norm": 3.713700294494629, + "learning_rate": 9.857723577235772e-06, + "loss": 1.3467, + "step": 485 + }, + { + "epoch": 0.2469982847341338, + "grad_norm": 3.270226001739502, + "learning_rate": 9.878048780487805e-06, + "loss": 1.334, + "step": 486 + }, + { + "epoch": 0.24750651165745505, + "grad_norm": 3.2157111167907715, + "learning_rate": 9.898373983739838e-06, + "loss": 1.3273, + "step": 487 + }, + { + "epoch": 0.24801473858077633, + "grad_norm": 3.4948418140411377, + "learning_rate": 9.91869918699187e-06, + "loss": 1.3266, + "step": 488 + }, + { + "epoch": 0.24852296550409758, + "grad_norm": 3.462024450302124, + "learning_rate": 9.939024390243903e-06, + "loss": 1.3567, + "step": 489 + }, + { + "epoch": 0.24903119242741883, + "grad_norm": 3.0976338386535645, + "learning_rate": 9.959349593495936e-06, + "loss": 1.2992, + "step": 490 + }, + { + "epoch": 0.24953941935074012, + "grad_norm": 3.3008170127868652, + "learning_rate": 9.979674796747968e-06, + "loss": 1.3137, + "step": 491 + }, + { + "epoch": 0.25004764627406134, + "grad_norm": 3.765357494354248, + "learning_rate": 1e-05, + "loss": 1.2204, + "step": 492 + }, + { + "epoch": 0.25055587319738265, + "grad_norm": 3.619002342224121, + "learning_rate": 9.999999717338245e-06, + "loss": 1.339, + "step": 493 + }, + { + "epoch": 0.2510641001207039, + "grad_norm": 3.694655418395996, + "learning_rate": 9.99999886935301e-06, + "loss": 1.3753, + "step": 494 + }, + { + "epoch": 0.25157232704402516, + "grad_norm": 3.6122829914093018, + "learning_rate": 9.99999745604439e-06, + "loss": 1.3275, + "step": 495 + }, + { + "epoch": 0.2520805539673464, + "grad_norm": 3.870494842529297, + "learning_rate": 9.999995477412547e-06, + "loss": 1.3107, + "step": 496 + }, + { + "epoch": 0.25258878089066766, + "grad_norm": 3.936599016189575, + "learning_rate": 9.999992933457705e-06, + "loss": 1.2448, + "step": 497 + }, + { + "epoch": 0.253097007813989, + "grad_norm": 3.2846243381500244, + "learning_rate": 9.99998982418015e-06, + "loss": 1.3264, + "step": 498 + }, + { + "epoch": 0.2536052347373102, + "grad_norm": 3.724277973175049, + "learning_rate": 9.999986149580232e-06, + "loss": 1.3372, + "step": 499 + }, + { + "epoch": 0.2541134616606315, + "grad_norm": 3.324705123901367, + "learning_rate": 9.99998190965837e-06, + "loss": 1.3758, + "step": 500 + }, + { + "epoch": 0.2541134616606315, + "eval_loss": 1.3164880275726318, + "eval_runtime": 13.0856, + "eval_samples_per_second": 30.568, + "eval_steps_per_second": 3.821, + "step": 500 + }, + { + "epoch": 0.25462168858395273, + "grad_norm": 4.158553600311279, + "learning_rate": 9.999977104415042e-06, + "loss": 1.4618, + "step": 501 + }, + { + "epoch": 0.255129915507274, + "grad_norm": 4.20340633392334, + "learning_rate": 9.99997173385079e-06, + "loss": 1.3603, + "step": 502 + }, + { + "epoch": 0.25563814243059524, + "grad_norm": 3.5411834716796875, + "learning_rate": 9.999965797966223e-06, + "loss": 1.3046, + "step": 503 + }, + { + "epoch": 0.25614636935391655, + "grad_norm": 3.406993865966797, + "learning_rate": 9.999959296762012e-06, + "loss": 1.3119, + "step": 504 + }, + { + "epoch": 0.2566545962772378, + "grad_norm": 3.4021811485290527, + "learning_rate": 9.999952230238893e-06, + "loss": 1.3131, + "step": 505 + }, + { + "epoch": 0.25716282320055905, + "grad_norm": 3.237227201461792, + "learning_rate": 9.99994459839766e-06, + "loss": 1.2948, + "step": 506 + }, + { + "epoch": 0.2576710501238803, + "grad_norm": 3.6270179748535156, + "learning_rate": 9.999936401239181e-06, + "loss": 1.378, + "step": 507 + }, + { + "epoch": 0.25817927704720156, + "grad_norm": 3.573146343231201, + "learning_rate": 9.999927638764382e-06, + "loss": 1.3479, + "step": 508 + }, + { + "epoch": 0.2586875039705228, + "grad_norm": 3.4049582481384277, + "learning_rate": 9.999918310974252e-06, + "loss": 1.3017, + "step": 509 + }, + { + "epoch": 0.2591957308938441, + "grad_norm": 3.151167392730713, + "learning_rate": 9.999908417869846e-06, + "loss": 1.2649, + "step": 510 + }, + { + "epoch": 0.2597039578171654, + "grad_norm": 3.395052194595337, + "learning_rate": 9.999897959452286e-06, + "loss": 1.2947, + "step": 511 + }, + { + "epoch": 0.26021218474048663, + "grad_norm": 3.3076987266540527, + "learning_rate": 9.999886935722749e-06, + "loss": 1.201, + "step": 512 + }, + { + "epoch": 0.2607204116638079, + "grad_norm": 3.6244215965270996, + "learning_rate": 9.999875346682483e-06, + "loss": 1.3617, + "step": 513 + }, + { + "epoch": 0.26122863858712914, + "grad_norm": 3.355215311050415, + "learning_rate": 9.999863192332803e-06, + "loss": 1.2969, + "step": 514 + }, + { + "epoch": 0.2617368655104504, + "grad_norm": 3.464101552963257, + "learning_rate": 9.999850472675076e-06, + "loss": 1.2228, + "step": 515 + }, + { + "epoch": 0.2622450924337717, + "grad_norm": 3.1731834411621094, + "learning_rate": 9.999837187710746e-06, + "loss": 1.314, + "step": 516 + }, + { + "epoch": 0.26275331935709295, + "grad_norm": 3.4594202041625977, + "learning_rate": 9.999823337441312e-06, + "loss": 1.2405, + "step": 517 + }, + { + "epoch": 0.2632615462804142, + "grad_norm": 3.259009599685669, + "learning_rate": 9.999808921868341e-06, + "loss": 1.2927, + "step": 518 + }, + { + "epoch": 0.26376977320373546, + "grad_norm": 3.5948798656463623, + "learning_rate": 9.999793940993463e-06, + "loss": 1.2082, + "step": 519 + }, + { + "epoch": 0.2642780001270567, + "grad_norm": 3.314972162246704, + "learning_rate": 9.99977839481837e-06, + "loss": 1.2475, + "step": 520 + }, + { + "epoch": 0.264786227050378, + "grad_norm": 3.383493661880493, + "learning_rate": 9.999762283344825e-06, + "loss": 1.2592, + "step": 521 + }, + { + "epoch": 0.2652944539736993, + "grad_norm": 3.365828275680542, + "learning_rate": 9.999745606574642e-06, + "loss": 1.3599, + "step": 522 + }, + { + "epoch": 0.2658026808970205, + "grad_norm": 3.2802915573120117, + "learning_rate": 9.99972836450971e-06, + "loss": 1.3388, + "step": 523 + }, + { + "epoch": 0.2663109078203418, + "grad_norm": 3.3013274669647217, + "learning_rate": 9.999710557151983e-06, + "loss": 1.2858, + "step": 524 + }, + { + "epoch": 0.26681913474366303, + "grad_norm": 3.198275089263916, + "learning_rate": 9.999692184503466e-06, + "loss": 1.2994, + "step": 525 + }, + { + "epoch": 0.2673273616669843, + "grad_norm": 3.4907963275909424, + "learning_rate": 9.999673246566242e-06, + "loss": 1.3816, + "step": 526 + }, + { + "epoch": 0.2678355885903056, + "grad_norm": 3.2818679809570312, + "learning_rate": 9.999653743342452e-06, + "loss": 1.186, + "step": 527 + }, + { + "epoch": 0.26834381551362685, + "grad_norm": 3.373699903488159, + "learning_rate": 9.999633674834299e-06, + "loss": 1.2908, + "step": 528 + }, + { + "epoch": 0.2688520424369481, + "grad_norm": 3.4973933696746826, + "learning_rate": 9.999613041044051e-06, + "loss": 1.4183, + "step": 529 + }, + { + "epoch": 0.26936026936026936, + "grad_norm": 3.5590484142303467, + "learning_rate": 9.999591841974045e-06, + "loss": 1.3278, + "step": 530 + }, + { + "epoch": 0.2698684962835906, + "grad_norm": 3.671595573425293, + "learning_rate": 9.999570077626676e-06, + "loss": 1.3794, + "step": 531 + }, + { + "epoch": 0.27037672320691186, + "grad_norm": 3.295187473297119, + "learning_rate": 9.999547748004403e-06, + "loss": 1.3537, + "step": 532 + }, + { + "epoch": 0.27088495013023317, + "grad_norm": 3.641406536102295, + "learning_rate": 9.999524853109755e-06, + "loss": 1.3603, + "step": 533 + }, + { + "epoch": 0.2713931770535544, + "grad_norm": 3.371995449066162, + "learning_rate": 9.999501392945314e-06, + "loss": 1.2268, + "step": 534 + }, + { + "epoch": 0.2719014039768757, + "grad_norm": 3.432286024093628, + "learning_rate": 9.999477367513739e-06, + "loss": 1.3287, + "step": 535 + }, + { + "epoch": 0.27240963090019693, + "grad_norm": 3.212390184402466, + "learning_rate": 9.999452776817741e-06, + "loss": 1.2798, + "step": 536 + }, + { + "epoch": 0.2729178578235182, + "grad_norm": 3.8736019134521484, + "learning_rate": 9.999427620860107e-06, + "loss": 1.3578, + "step": 537 + }, + { + "epoch": 0.27342608474683944, + "grad_norm": 3.1469552516937256, + "learning_rate": 9.999401899643675e-06, + "loss": 1.3325, + "step": 538 + }, + { + "epoch": 0.27393431167016075, + "grad_norm": 4.098660945892334, + "learning_rate": 9.999375613171356e-06, + "loss": 1.3981, + "step": 539 + }, + { + "epoch": 0.274442538593482, + "grad_norm": 3.2645022869110107, + "learning_rate": 9.999348761446122e-06, + "loss": 1.3094, + "step": 540 + }, + { + "epoch": 0.27495076551680325, + "grad_norm": 3.239898204803467, + "learning_rate": 9.999321344471007e-06, + "loss": 1.2965, + "step": 541 + }, + { + "epoch": 0.2754589924401245, + "grad_norm": 3.435715913772583, + "learning_rate": 9.999293362249114e-06, + "loss": 1.3529, + "step": 542 + }, + { + "epoch": 0.27596721936344576, + "grad_norm": 3.2523412704467773, + "learning_rate": 9.999264814783603e-06, + "loss": 1.3146, + "step": 543 + }, + { + "epoch": 0.27647544628676707, + "grad_norm": 3.3631367683410645, + "learning_rate": 9.999235702077707e-06, + "loss": 1.2696, + "step": 544 + }, + { + "epoch": 0.2769836732100883, + "grad_norm": 3.2622344493865967, + "learning_rate": 9.999206024134714e-06, + "loss": 1.3845, + "step": 545 + }, + { + "epoch": 0.2774919001334096, + "grad_norm": 3.6121559143066406, + "learning_rate": 9.999175780957976e-06, + "loss": 1.3381, + "step": 546 + }, + { + "epoch": 0.27800012705673083, + "grad_norm": 3.354872941970825, + "learning_rate": 9.999144972550922e-06, + "loss": 1.3214, + "step": 547 + }, + { + "epoch": 0.2785083539800521, + "grad_norm": 3.4644815921783447, + "learning_rate": 9.999113598917027e-06, + "loss": 1.3543, + "step": 548 + }, + { + "epoch": 0.27901658090337333, + "grad_norm": 3.3032991886138916, + "learning_rate": 9.999081660059842e-06, + "loss": 1.3811, + "step": 549 + }, + { + "epoch": 0.27952480782669464, + "grad_norm": 3.470670461654663, + "learning_rate": 9.999049155982977e-06, + "loss": 1.3831, + "step": 550 + }, + { + "epoch": 0.2800330347500159, + "grad_norm": 3.5726518630981445, + "learning_rate": 9.999016086690108e-06, + "loss": 1.2807, + "step": 551 + }, + { + "epoch": 0.28054126167333715, + "grad_norm": 3.480273962020874, + "learning_rate": 9.998982452184974e-06, + "loss": 1.3818, + "step": 552 + }, + { + "epoch": 0.2810494885966584, + "grad_norm": 3.783210277557373, + "learning_rate": 9.998948252471375e-06, + "loss": 1.2638, + "step": 553 + }, + { + "epoch": 0.28155771551997966, + "grad_norm": 3.0054821968078613, + "learning_rate": 9.998913487553182e-06, + "loss": 1.2592, + "step": 554 + }, + { + "epoch": 0.2820659424433009, + "grad_norm": 3.3007564544677734, + "learning_rate": 9.998878157434322e-06, + "loss": 1.3479, + "step": 555 + }, + { + "epoch": 0.2825741693666222, + "grad_norm": 3.2451131343841553, + "learning_rate": 9.99884226211879e-06, + "loss": 1.263, + "step": 556 + }, + { + "epoch": 0.28308239628994347, + "grad_norm": 3.73813796043396, + "learning_rate": 9.99880580161065e-06, + "loss": 1.3618, + "step": 557 + }, + { + "epoch": 0.2835906232132647, + "grad_norm": 3.4133875370025635, + "learning_rate": 9.998768775914017e-06, + "loss": 1.3835, + "step": 558 + }, + { + "epoch": 0.284098850136586, + "grad_norm": 3.248453140258789, + "learning_rate": 9.998731185033081e-06, + "loss": 1.3094, + "step": 559 + }, + { + "epoch": 0.28460707705990723, + "grad_norm": 3.074777603149414, + "learning_rate": 9.998693028972092e-06, + "loss": 1.1955, + "step": 560 + }, + { + "epoch": 0.2851153039832285, + "grad_norm": 3.389275312423706, + "learning_rate": 9.998654307735364e-06, + "loss": 1.3009, + "step": 561 + }, + { + "epoch": 0.2856235309065498, + "grad_norm": 3.305894374847412, + "learning_rate": 9.998615021327274e-06, + "loss": 1.2888, + "step": 562 + }, + { + "epoch": 0.28613175782987105, + "grad_norm": 3.0569679737091064, + "learning_rate": 9.998575169752265e-06, + "loss": 1.301, + "step": 563 + }, + { + "epoch": 0.2866399847531923, + "grad_norm": 3.3297672271728516, + "learning_rate": 9.998534753014842e-06, + "loss": 1.2979, + "step": 564 + }, + { + "epoch": 0.28714821167651355, + "grad_norm": 3.3406970500946045, + "learning_rate": 9.998493771119576e-06, + "loss": 1.3016, + "step": 565 + }, + { + "epoch": 0.2876564385998348, + "grad_norm": 3.455514430999756, + "learning_rate": 9.9984522240711e-06, + "loss": 1.2808, + "step": 566 + }, + { + "epoch": 0.2881646655231561, + "grad_norm": 3.438077211380005, + "learning_rate": 9.99841011187411e-06, + "loss": 1.3682, + "step": 567 + }, + { + "epoch": 0.28867289244647737, + "grad_norm": 3.4340884685516357, + "learning_rate": 9.99836743453337e-06, + "loss": 1.2293, + "step": 568 + }, + { + "epoch": 0.2891811193697986, + "grad_norm": 3.3622660636901855, + "learning_rate": 9.998324192053704e-06, + "loss": 1.3429, + "step": 569 + }, + { + "epoch": 0.2896893462931199, + "grad_norm": 3.2343058586120605, + "learning_rate": 9.99828038444e-06, + "loss": 1.2378, + "step": 570 + }, + { + "epoch": 0.29019757321644113, + "grad_norm": 3.1985490322113037, + "learning_rate": 9.998236011697214e-06, + "loss": 1.3157, + "step": 571 + }, + { + "epoch": 0.2907058001397624, + "grad_norm": 3.379235029220581, + "learning_rate": 9.99819107383036e-06, + "loss": 1.3078, + "step": 572 + }, + { + "epoch": 0.2912140270630837, + "grad_norm": 3.259159564971924, + "learning_rate": 9.998145570844519e-06, + "loss": 1.3411, + "step": 573 + }, + { + "epoch": 0.29172225398640494, + "grad_norm": 3.191131591796875, + "learning_rate": 9.99809950274484e-06, + "loss": 1.2504, + "step": 574 + }, + { + "epoch": 0.2922304809097262, + "grad_norm": 3.2074849605560303, + "learning_rate": 9.998052869536526e-06, + "loss": 1.3674, + "step": 575 + }, + { + "epoch": 0.29273870783304745, + "grad_norm": 3.2082672119140625, + "learning_rate": 9.998005671224852e-06, + "loss": 1.2857, + "step": 576 + }, + { + "epoch": 0.2932469347563687, + "grad_norm": 3.390986919403076, + "learning_rate": 9.997957907815158e-06, + "loss": 1.4165, + "step": 577 + }, + { + "epoch": 0.29375516167968996, + "grad_norm": 3.38319993019104, + "learning_rate": 9.997909579312839e-06, + "loss": 1.2715, + "step": 578 + }, + { + "epoch": 0.29426338860301127, + "grad_norm": 4.208193302154541, + "learning_rate": 9.997860685723361e-06, + "loss": 1.2918, + "step": 579 + }, + { + "epoch": 0.2947716155263325, + "grad_norm": 3.22011137008667, + "learning_rate": 9.997811227052251e-06, + "loss": 1.2389, + "step": 580 + }, + { + "epoch": 0.2952798424496538, + "grad_norm": 3.2726387977600098, + "learning_rate": 9.997761203305105e-06, + "loss": 1.3157, + "step": 581 + }, + { + "epoch": 0.295788069372975, + "grad_norm": 3.379770040512085, + "learning_rate": 9.997710614487575e-06, + "loss": 1.2954, + "step": 582 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 3.0684187412261963, + "learning_rate": 9.997659460605382e-06, + "loss": 1.309, + "step": 583 + }, + { + "epoch": 0.29680452321961753, + "grad_norm": 3.5520968437194824, + "learning_rate": 9.99760774166431e-06, + "loss": 1.2515, + "step": 584 + }, + { + "epoch": 0.29731275014293884, + "grad_norm": 3.340465784072876, + "learning_rate": 9.997555457670207e-06, + "loss": 1.1975, + "step": 585 + }, + { + "epoch": 0.2978209770662601, + "grad_norm": 3.183685779571533, + "learning_rate": 9.997502608628984e-06, + "loss": 1.2544, + "step": 586 + }, + { + "epoch": 0.29832920398958135, + "grad_norm": 3.2117257118225098, + "learning_rate": 9.997449194546616e-06, + "loss": 1.2248, + "step": 587 + }, + { + "epoch": 0.2988374309129026, + "grad_norm": 3.3444666862487793, + "learning_rate": 9.997395215429142e-06, + "loss": 1.2858, + "step": 588 + }, + { + "epoch": 0.29934565783622386, + "grad_norm": 3.0064151287078857, + "learning_rate": 9.997340671282667e-06, + "loss": 1.2255, + "step": 589 + }, + { + "epoch": 0.29985388475954516, + "grad_norm": 3.2752397060394287, + "learning_rate": 9.997285562113355e-06, + "loss": 1.3126, + "step": 590 + }, + { + "epoch": 0.3003621116828664, + "grad_norm": 3.286292791366577, + "learning_rate": 9.99722988792744e-06, + "loss": 1.3219, + "step": 591 + }, + { + "epoch": 0.30087033860618767, + "grad_norm": 4.162260055541992, + "learning_rate": 9.997173648731214e-06, + "loss": 1.3552, + "step": 592 + }, + { + "epoch": 0.3013785655295089, + "grad_norm": 3.4235987663269043, + "learning_rate": 9.997116844531039e-06, + "loss": 1.294, + "step": 593 + }, + { + "epoch": 0.3018867924528302, + "grad_norm": 3.3392674922943115, + "learning_rate": 9.997059475333332e-06, + "loss": 1.4294, + "step": 594 + }, + { + "epoch": 0.30239501937615143, + "grad_norm": 3.367549180984497, + "learning_rate": 9.997001541144587e-06, + "loss": 1.3199, + "step": 595 + }, + { + "epoch": 0.30290324629947274, + "grad_norm": 3.3252546787261963, + "learning_rate": 9.996943041971348e-06, + "loss": 1.3147, + "step": 596 + }, + { + "epoch": 0.303411473222794, + "grad_norm": 3.1721370220184326, + "learning_rate": 9.996883977820233e-06, + "loss": 1.2498, + "step": 597 + }, + { + "epoch": 0.30391970014611525, + "grad_norm": 3.716733694076538, + "learning_rate": 9.996824348697917e-06, + "loss": 1.2548, + "step": 598 + }, + { + "epoch": 0.3044279270694365, + "grad_norm": 3.3994574546813965, + "learning_rate": 9.996764154611145e-06, + "loss": 1.3619, + "step": 599 + }, + { + "epoch": 0.30493615399275775, + "grad_norm": 3.4203522205352783, + "learning_rate": 9.996703395566721e-06, + "loss": 1.2884, + "step": 600 + }, + { + "epoch": 0.305444380916079, + "grad_norm": 3.305091381072998, + "learning_rate": 9.996642071571514e-06, + "loss": 1.3636, + "step": 601 + }, + { + "epoch": 0.3059526078394003, + "grad_norm": 3.121256113052368, + "learning_rate": 9.996580182632459e-06, + "loss": 1.4095, + "step": 602 + }, + { + "epoch": 0.30646083476272157, + "grad_norm": 3.227128267288208, + "learning_rate": 9.996517728756554e-06, + "loss": 1.3859, + "step": 603 + }, + { + "epoch": 0.3069690616860428, + "grad_norm": 3.152439594268799, + "learning_rate": 9.996454709950859e-06, + "loss": 1.3499, + "step": 604 + }, + { + "epoch": 0.3074772886093641, + "grad_norm": 3.302140235900879, + "learning_rate": 9.996391126222499e-06, + "loss": 1.3407, + "step": 605 + }, + { + "epoch": 0.30798551553268533, + "grad_norm": 3.436461925506592, + "learning_rate": 9.996326977578664e-06, + "loss": 1.2528, + "step": 606 + }, + { + "epoch": 0.3084937424560066, + "grad_norm": 3.0147430896759033, + "learning_rate": 9.996262264026608e-06, + "loss": 1.1042, + "step": 607 + }, + { + "epoch": 0.3090019693793279, + "grad_norm": 3.2218759059906006, + "learning_rate": 9.996196985573644e-06, + "loss": 1.431, + "step": 608 + }, + { + "epoch": 0.30951019630264914, + "grad_norm": 3.731808662414551, + "learning_rate": 9.996131142227156e-06, + "loss": 1.4065, + "step": 609 + }, + { + "epoch": 0.3100184232259704, + "grad_norm": 3.240323781967163, + "learning_rate": 9.996064733994588e-06, + "loss": 1.3583, + "step": 610 + }, + { + "epoch": 0.31052665014929165, + "grad_norm": 3.2610456943511963, + "learning_rate": 9.99599776088345e-06, + "loss": 1.2872, + "step": 611 + }, + { + "epoch": 0.3110348770726129, + "grad_norm": 3.4224603176116943, + "learning_rate": 9.99593022290131e-06, + "loss": 1.2538, + "step": 612 + }, + { + "epoch": 0.3115431039959342, + "grad_norm": 3.205958843231201, + "learning_rate": 9.995862120055807e-06, + "loss": 1.2848, + "step": 613 + }, + { + "epoch": 0.31205133091925547, + "grad_norm": 2.9460086822509766, + "learning_rate": 9.995793452354641e-06, + "loss": 1.2136, + "step": 614 + }, + { + "epoch": 0.3125595578425767, + "grad_norm": 3.2204792499542236, + "learning_rate": 9.995724219805575e-06, + "loss": 1.2838, + "step": 615 + }, + { + "epoch": 0.31306778476589797, + "grad_norm": 3.413954019546509, + "learning_rate": 9.99565442241644e-06, + "loss": 1.4099, + "step": 616 + }, + { + "epoch": 0.3135760116892192, + "grad_norm": 3.393963098526001, + "learning_rate": 9.99558406019512e-06, + "loss": 1.3108, + "step": 617 + }, + { + "epoch": 0.3140842386125405, + "grad_norm": 3.3361024856567383, + "learning_rate": 9.99551313314958e-06, + "loss": 1.3209, + "step": 618 + }, + { + "epoch": 0.3145924655358618, + "grad_norm": 3.162201404571533, + "learning_rate": 9.995441641287833e-06, + "loss": 1.2169, + "step": 619 + }, + { + "epoch": 0.31510069245918304, + "grad_norm": 3.283411979675293, + "learning_rate": 9.995369584617962e-06, + "loss": 1.3413, + "step": 620 + }, + { + "epoch": 0.3156089193825043, + "grad_norm": 3.4232754707336426, + "learning_rate": 9.995296963148118e-06, + "loss": 1.2927, + "step": 621 + }, + { + "epoch": 0.31611714630582555, + "grad_norm": 3.652552604675293, + "learning_rate": 9.99522377688651e-06, + "loss": 1.4328, + "step": 622 + }, + { + "epoch": 0.3166253732291468, + "grad_norm": 3.1629154682159424, + "learning_rate": 9.995150025841412e-06, + "loss": 1.2648, + "step": 623 + }, + { + "epoch": 0.31713360015246805, + "grad_norm": 3.021181106567383, + "learning_rate": 9.995075710021165e-06, + "loss": 1.2518, + "step": 624 + }, + { + "epoch": 0.31764182707578936, + "grad_norm": 3.2148756980895996, + "learning_rate": 9.995000829434167e-06, + "loss": 1.3312, + "step": 625 + }, + { + "epoch": 0.3181500539991106, + "grad_norm": 3.3323326110839844, + "learning_rate": 9.994925384088889e-06, + "loss": 1.2723, + "step": 626 + }, + { + "epoch": 0.31865828092243187, + "grad_norm": 3.3048861026763916, + "learning_rate": 9.994849373993861e-06, + "loss": 1.372, + "step": 627 + }, + { + "epoch": 0.3191665078457531, + "grad_norm": 3.1596617698669434, + "learning_rate": 9.994772799157672e-06, + "loss": 1.159, + "step": 628 + }, + { + "epoch": 0.3196747347690744, + "grad_norm": 3.392035484313965, + "learning_rate": 9.994695659588985e-06, + "loss": 1.4064, + "step": 629 + }, + { + "epoch": 0.32018296169239563, + "grad_norm": 3.708467483520508, + "learning_rate": 9.99461795529652e-06, + "loss": 1.425, + "step": 630 + }, + { + "epoch": 0.32069118861571694, + "grad_norm": 3.287665843963623, + "learning_rate": 9.994539686289063e-06, + "loss": 1.2154, + "step": 631 + }, + { + "epoch": 0.3211994155390382, + "grad_norm": 3.2387781143188477, + "learning_rate": 9.994460852575463e-06, + "loss": 1.3697, + "step": 632 + }, + { + "epoch": 0.32170764246235944, + "grad_norm": 3.511781692504883, + "learning_rate": 9.994381454164635e-06, + "loss": 1.3696, + "step": 633 + }, + { + "epoch": 0.3222158693856807, + "grad_norm": 3.1286818981170654, + "learning_rate": 9.994301491065552e-06, + "loss": 1.2287, + "step": 634 + }, + { + "epoch": 0.32272409630900195, + "grad_norm": 3.539268970489502, + "learning_rate": 9.994220963287258e-06, + "loss": 1.2992, + "step": 635 + }, + { + "epoch": 0.32323232323232326, + "grad_norm": 3.2066617012023926, + "learning_rate": 9.994139870838859e-06, + "loss": 1.3689, + "step": 636 + }, + { + "epoch": 0.3237405501556445, + "grad_norm": 3.4815847873687744, + "learning_rate": 9.994058213729523e-06, + "loss": 1.2067, + "step": 637 + }, + { + "epoch": 0.32424877707896577, + "grad_norm": 3.814072370529175, + "learning_rate": 9.993975991968478e-06, + "loss": 1.2652, + "step": 638 + }, + { + "epoch": 0.324757004002287, + "grad_norm": 3.1743524074554443, + "learning_rate": 9.993893205565029e-06, + "loss": 1.3056, + "step": 639 + }, + { + "epoch": 0.3252652309256083, + "grad_norm": 3.4408047199249268, + "learning_rate": 9.993809854528529e-06, + "loss": 1.3515, + "step": 640 + }, + { + "epoch": 0.3257734578489295, + "grad_norm": 3.353102922439575, + "learning_rate": 9.993725938868404e-06, + "loss": 1.322, + "step": 641 + }, + { + "epoch": 0.32628168477225084, + "grad_norm": 4.640409469604492, + "learning_rate": 9.993641458594142e-06, + "loss": 1.4992, + "step": 642 + }, + { + "epoch": 0.3267899116955721, + "grad_norm": 3.294832706451416, + "learning_rate": 9.993556413715294e-06, + "loss": 1.3659, + "step": 643 + }, + { + "epoch": 0.32729813861889334, + "grad_norm": 3.26865553855896, + "learning_rate": 9.993470804241481e-06, + "loss": 1.3908, + "step": 644 + }, + { + "epoch": 0.3278063655422146, + "grad_norm": 3.2061288356781006, + "learning_rate": 9.993384630182375e-06, + "loss": 1.2603, + "step": 645 + }, + { + "epoch": 0.32831459246553585, + "grad_norm": 3.1718034744262695, + "learning_rate": 9.993297891547722e-06, + "loss": 1.3821, + "step": 646 + }, + { + "epoch": 0.3288228193888571, + "grad_norm": 3.1801249980926514, + "learning_rate": 9.99321058834733e-06, + "loss": 1.2118, + "step": 647 + }, + { + "epoch": 0.3293310463121784, + "grad_norm": 3.2288734912872314, + "learning_rate": 9.99312272059107e-06, + "loss": 1.2868, + "step": 648 + }, + { + "epoch": 0.32983927323549966, + "grad_norm": 3.5571651458740234, + "learning_rate": 9.993034288288874e-06, + "loss": 1.223, + "step": 649 + }, + { + "epoch": 0.3303475001588209, + "grad_norm": 3.352027654647827, + "learning_rate": 9.992945291450744e-06, + "loss": 1.2518, + "step": 650 + }, + { + "epoch": 0.33085572708214217, + "grad_norm": 3.242868185043335, + "learning_rate": 9.992855730086741e-06, + "loss": 1.2442, + "step": 651 + }, + { + "epoch": 0.3313639540054634, + "grad_norm": 3.3032219409942627, + "learning_rate": 9.992765604206992e-06, + "loss": 1.3753, + "step": 652 + }, + { + "epoch": 0.3318721809287847, + "grad_norm": 3.234017848968506, + "learning_rate": 9.992674913821685e-06, + "loss": 1.2213, + "step": 653 + }, + { + "epoch": 0.332380407852106, + "grad_norm": 3.0645787715911865, + "learning_rate": 9.992583658941075e-06, + "loss": 1.2599, + "step": 654 + }, + { + "epoch": 0.33288863477542724, + "grad_norm": 3.3873555660247803, + "learning_rate": 9.992491839575481e-06, + "loss": 1.2812, + "step": 655 + }, + { + "epoch": 0.3333968616987485, + "grad_norm": 3.0735232830047607, + "learning_rate": 9.992399455735283e-06, + "loss": 1.1829, + "step": 656 + }, + { + "epoch": 0.33390508862206975, + "grad_norm": 3.1945180892944336, + "learning_rate": 9.992306507430927e-06, + "loss": 1.2562, + "step": 657 + }, + { + "epoch": 0.334413315545391, + "grad_norm": 3.20089054107666, + "learning_rate": 9.992212994672921e-06, + "loss": 1.3315, + "step": 658 + }, + { + "epoch": 0.3349215424687123, + "grad_norm": 3.3600375652313232, + "learning_rate": 9.99211891747184e-06, + "loss": 1.3288, + "step": 659 + }, + { + "epoch": 0.33542976939203356, + "grad_norm": 3.2655248641967773, + "learning_rate": 9.992024275838318e-06, + "loss": 1.2318, + "step": 660 + }, + { + "epoch": 0.3359379963153548, + "grad_norm": 3.1854372024536133, + "learning_rate": 9.991929069783058e-06, + "loss": 1.2953, + "step": 661 + }, + { + "epoch": 0.33644622323867607, + "grad_norm": 3.1260249614715576, + "learning_rate": 9.991833299316824e-06, + "loss": 1.3619, + "step": 662 + }, + { + "epoch": 0.3369544501619973, + "grad_norm": 3.1407597064971924, + "learning_rate": 9.991736964450445e-06, + "loss": 1.2393, + "step": 663 + }, + { + "epoch": 0.3374626770853186, + "grad_norm": 3.2042787075042725, + "learning_rate": 9.991640065194812e-06, + "loss": 1.3299, + "step": 664 + }, + { + "epoch": 0.3379709040086399, + "grad_norm": 3.058418035507202, + "learning_rate": 9.99154260156088e-06, + "loss": 1.2894, + "step": 665 + }, + { + "epoch": 0.33847913093196114, + "grad_norm": 3.146761178970337, + "learning_rate": 9.99144457355967e-06, + "loss": 1.4489, + "step": 666 + }, + { + "epoch": 0.3389873578552824, + "grad_norm": 11.600865364074707, + "learning_rate": 9.991345981202265e-06, + "loss": 1.5436, + "step": 667 + }, + { + "epoch": 0.33949558477860364, + "grad_norm": 3.060974359512329, + "learning_rate": 9.991246824499812e-06, + "loss": 1.2756, + "step": 668 + }, + { + "epoch": 0.3400038117019249, + "grad_norm": 3.2085535526275635, + "learning_rate": 9.991147103463523e-06, + "loss": 1.1935, + "step": 669 + }, + { + "epoch": 0.34051203862524615, + "grad_norm": 3.497408628463745, + "learning_rate": 9.991046818104674e-06, + "loss": 1.3223, + "step": 670 + }, + { + "epoch": 0.34102026554856746, + "grad_norm": 3.2515928745269775, + "learning_rate": 9.990945968434601e-06, + "loss": 1.2761, + "step": 671 + }, + { + "epoch": 0.3415284924718887, + "grad_norm": 3.371119737625122, + "learning_rate": 9.990844554464709e-06, + "loss": 1.245, + "step": 672 + }, + { + "epoch": 0.34203671939520996, + "grad_norm": 3.2016313076019287, + "learning_rate": 9.990742576206462e-06, + "loss": 1.3644, + "step": 673 + }, + { + "epoch": 0.3425449463185312, + "grad_norm": 3.163677453994751, + "learning_rate": 9.990640033671391e-06, + "loss": 1.271, + "step": 674 + }, + { + "epoch": 0.34305317324185247, + "grad_norm": 3.464029312133789, + "learning_rate": 9.99053692687109e-06, + "loss": 1.3403, + "step": 675 + }, + { + "epoch": 0.3435614001651737, + "grad_norm": 3.115363836288452, + "learning_rate": 9.990433255817218e-06, + "loss": 1.2434, + "step": 676 + }, + { + "epoch": 0.34406962708849503, + "grad_norm": 3.0379855632781982, + "learning_rate": 9.990329020521497e-06, + "loss": 1.2424, + "step": 677 + }, + { + "epoch": 0.3445778540118163, + "grad_norm": 3.1256349086761475, + "learning_rate": 9.990224220995709e-06, + "loss": 1.2773, + "step": 678 + }, + { + "epoch": 0.34508608093513754, + "grad_norm": 2.9989559650421143, + "learning_rate": 9.990118857251706e-06, + "loss": 1.2307, + "step": 679 + }, + { + "epoch": 0.3455943078584588, + "grad_norm": 3.4447340965270996, + "learning_rate": 9.990012929301399e-06, + "loss": 1.3264, + "step": 680 + }, + { + "epoch": 0.34610253478178005, + "grad_norm": 3.2726187705993652, + "learning_rate": 9.989906437156766e-06, + "loss": 1.3172, + "step": 681 + }, + { + "epoch": 0.3466107617051013, + "grad_norm": 3.2503907680511475, + "learning_rate": 9.989799380829846e-06, + "loss": 1.2419, + "step": 682 + }, + { + "epoch": 0.3471189886284226, + "grad_norm": 3.216642141342163, + "learning_rate": 9.989691760332748e-06, + "loss": 1.275, + "step": 683 + }, + { + "epoch": 0.34762721555174386, + "grad_norm": 3.044985055923462, + "learning_rate": 9.989583575677633e-06, + "loss": 1.2534, + "step": 684 + }, + { + "epoch": 0.3481354424750651, + "grad_norm": 3.3953421115875244, + "learning_rate": 9.989474826876736e-06, + "loss": 1.3845, + "step": 685 + }, + { + "epoch": 0.34864366939838637, + "grad_norm": 3.6470160484313965, + "learning_rate": 9.989365513942356e-06, + "loss": 1.3019, + "step": 686 + }, + { + "epoch": 0.3491518963217076, + "grad_norm": 3.700324296951294, + "learning_rate": 9.989255636886848e-06, + "loss": 1.3368, + "step": 687 + }, + { + "epoch": 0.34966012324502893, + "grad_norm": 2.9334194660186768, + "learning_rate": 9.989145195722636e-06, + "loss": 1.1772, + "step": 688 + }, + { + "epoch": 0.3501683501683502, + "grad_norm": 3.1360538005828857, + "learning_rate": 9.989034190462207e-06, + "loss": 1.3372, + "step": 689 + }, + { + "epoch": 0.35067657709167144, + "grad_norm": 3.0413472652435303, + "learning_rate": 9.988922621118115e-06, + "loss": 1.3548, + "step": 690 + }, + { + "epoch": 0.3511848040149927, + "grad_norm": 3.3083596229553223, + "learning_rate": 9.988810487702971e-06, + "loss": 1.3764, + "step": 691 + }, + { + "epoch": 0.35169303093831394, + "grad_norm": 3.088041067123413, + "learning_rate": 9.988697790229454e-06, + "loss": 1.3161, + "step": 692 + }, + { + "epoch": 0.3522012578616352, + "grad_norm": 3.1266753673553467, + "learning_rate": 9.988584528710306e-06, + "loss": 1.3091, + "step": 693 + }, + { + "epoch": 0.3527094847849565, + "grad_norm": 3.1496315002441406, + "learning_rate": 9.988470703158334e-06, + "loss": 1.2456, + "step": 694 + }, + { + "epoch": 0.35321771170827776, + "grad_norm": 3.72305965423584, + "learning_rate": 9.988356313586407e-06, + "loss": 1.3824, + "step": 695 + }, + { + "epoch": 0.353725938631599, + "grad_norm": 3.113633632659912, + "learning_rate": 9.988241360007459e-06, + "loss": 1.385, + "step": 696 + }, + { + "epoch": 0.35423416555492027, + "grad_norm": 2.981914758682251, + "learning_rate": 9.988125842434484e-06, + "loss": 1.1441, + "step": 697 + }, + { + "epoch": 0.3547423924782415, + "grad_norm": 3.1952383518218994, + "learning_rate": 9.988009760880548e-06, + "loss": 1.3209, + "step": 698 + }, + { + "epoch": 0.3552506194015628, + "grad_norm": 3.1060612201690674, + "learning_rate": 9.987893115358773e-06, + "loss": 1.2458, + "step": 699 + }, + { + "epoch": 0.3557588463248841, + "grad_norm": 3.365842819213867, + "learning_rate": 9.987775905882346e-06, + "loss": 1.338, + "step": 700 + }, + { + "epoch": 0.35626707324820533, + "grad_norm": 3.0432286262512207, + "learning_rate": 9.987658132464524e-06, + "loss": 1.2491, + "step": 701 + }, + { + "epoch": 0.3567753001715266, + "grad_norm": 3.0596561431884766, + "learning_rate": 9.987539795118617e-06, + "loss": 1.3572, + "step": 702 + }, + { + "epoch": 0.35728352709484784, + "grad_norm": 3.2221055030822754, + "learning_rate": 9.987420893858011e-06, + "loss": 1.3876, + "step": 703 + }, + { + "epoch": 0.3577917540181691, + "grad_norm": 3.2124743461608887, + "learning_rate": 9.987301428696144e-06, + "loss": 1.2375, + "step": 704 + }, + { + "epoch": 0.35829998094149035, + "grad_norm": 3.352320671081543, + "learning_rate": 9.987181399646526e-06, + "loss": 1.4334, + "step": 705 + }, + { + "epoch": 0.35880820786481166, + "grad_norm": 3.2828238010406494, + "learning_rate": 9.987060806722727e-06, + "loss": 1.2911, + "step": 706 + }, + { + "epoch": 0.3593164347881329, + "grad_norm": 3.1434576511383057, + "learning_rate": 9.986939649938385e-06, + "loss": 1.3936, + "step": 707 + }, + { + "epoch": 0.35982466171145416, + "grad_norm": 3.1314871311187744, + "learning_rate": 9.986817929307194e-06, + "loss": 1.2858, + "step": 708 + }, + { + "epoch": 0.3603328886347754, + "grad_norm": 3.170621156692505, + "learning_rate": 9.986695644842918e-06, + "loss": 1.2604, + "step": 709 + }, + { + "epoch": 0.36084111555809667, + "grad_norm": 3.3497283458709717, + "learning_rate": 9.986572796559386e-06, + "loss": 1.2838, + "step": 710 + }, + { + "epoch": 0.361349342481418, + "grad_norm": 3.2710769176483154, + "learning_rate": 9.986449384470483e-06, + "loss": 1.315, + "step": 711 + }, + { + "epoch": 0.36185756940473923, + "grad_norm": 3.350487232208252, + "learning_rate": 9.986325408590165e-06, + "loss": 1.2497, + "step": 712 + }, + { + "epoch": 0.3623657963280605, + "grad_norm": 3.4346296787261963, + "learning_rate": 9.98620086893245e-06, + "loss": 1.3931, + "step": 713 + }, + { + "epoch": 0.36287402325138174, + "grad_norm": 3.1220874786376953, + "learning_rate": 9.986075765511417e-06, + "loss": 1.3431, + "step": 714 + }, + { + "epoch": 0.363382250174703, + "grad_norm": 3.2858989238739014, + "learning_rate": 9.985950098341213e-06, + "loss": 1.304, + "step": 715 + }, + { + "epoch": 0.36389047709802425, + "grad_norm": 3.1637048721313477, + "learning_rate": 9.985823867436045e-06, + "loss": 1.3185, + "step": 716 + }, + { + "epoch": 0.36439870402134555, + "grad_norm": 3.1585402488708496, + "learning_rate": 9.985697072810185e-06, + "loss": 1.3015, + "step": 717 + }, + { + "epoch": 0.3649069309446668, + "grad_norm": 3.1651861667633057, + "learning_rate": 9.98556971447797e-06, + "loss": 1.3635, + "step": 718 + }, + { + "epoch": 0.36541515786798806, + "grad_norm": 3.2013018131256104, + "learning_rate": 9.9854417924538e-06, + "loss": 1.381, + "step": 719 + }, + { + "epoch": 0.3659233847913093, + "grad_norm": 3.0635321140289307, + "learning_rate": 9.985313306752136e-06, + "loss": 1.2533, + "step": 720 + }, + { + "epoch": 0.36643161171463057, + "grad_norm": 2.983309507369995, + "learning_rate": 9.98518425738751e-06, + "loss": 1.2858, + "step": 721 + }, + { + "epoch": 0.3669398386379518, + "grad_norm": 3.1740927696228027, + "learning_rate": 9.985054644374509e-06, + "loss": 1.2483, + "step": 722 + }, + { + "epoch": 0.36744806556127313, + "grad_norm": 3.0193238258361816, + "learning_rate": 9.984924467727787e-06, + "loss": 1.3102, + "step": 723 + }, + { + "epoch": 0.3679562924845944, + "grad_norm": 3.6168391704559326, + "learning_rate": 9.984793727462065e-06, + "loss": 1.2824, + "step": 724 + }, + { + "epoch": 0.36846451940791564, + "grad_norm": 3.6449429988861084, + "learning_rate": 9.984662423592124e-06, + "loss": 1.4594, + "step": 725 + }, + { + "epoch": 0.3689727463312369, + "grad_norm": 3.096966505050659, + "learning_rate": 9.984530556132812e-06, + "loss": 1.2573, + "step": 726 + }, + { + "epoch": 0.36948097325455814, + "grad_norm": 3.231896162033081, + "learning_rate": 9.984398125099033e-06, + "loss": 1.2727, + "step": 727 + }, + { + "epoch": 0.3699892001778794, + "grad_norm": 3.1200449466705322, + "learning_rate": 9.984265130505766e-06, + "loss": 1.3387, + "step": 728 + }, + { + "epoch": 0.3704974271012007, + "grad_norm": 3.424175977706909, + "learning_rate": 9.984131572368045e-06, + "loss": 1.3011, + "step": 729 + }, + { + "epoch": 0.37100565402452196, + "grad_norm": 3.364169120788574, + "learning_rate": 9.983997450700973e-06, + "loss": 1.3665, + "step": 730 + }, + { + "epoch": 0.3715138809478432, + "grad_norm": 3.1565613746643066, + "learning_rate": 9.983862765519711e-06, + "loss": 1.2281, + "step": 731 + }, + { + "epoch": 0.37202210787116446, + "grad_norm": 3.174419403076172, + "learning_rate": 9.98372751683949e-06, + "loss": 1.3035, + "step": 732 + }, + { + "epoch": 0.3725303347944857, + "grad_norm": 2.9651894569396973, + "learning_rate": 9.983591704675602e-06, + "loss": 1.2217, + "step": 733 + }, + { + "epoch": 0.373038561717807, + "grad_norm": 3.3082499504089355, + "learning_rate": 9.9834553290434e-06, + "loss": 1.3253, + "step": 734 + }, + { + "epoch": 0.3735467886411283, + "grad_norm": 3.055314064025879, + "learning_rate": 9.983318389958305e-06, + "loss": 1.2681, + "step": 735 + }, + { + "epoch": 0.37405501556444953, + "grad_norm": 3.4626822471618652, + "learning_rate": 9.983180887435799e-06, + "loss": 1.2864, + "step": 736 + }, + { + "epoch": 0.3745632424877708, + "grad_norm": 2.935825824737549, + "learning_rate": 9.983042821491432e-06, + "loss": 1.1635, + "step": 737 + }, + { + "epoch": 0.37507146941109204, + "grad_norm": 3.4077136516571045, + "learning_rate": 9.982904192140808e-06, + "loss": 1.56, + "step": 738 + }, + { + "epoch": 0.3755796963344133, + "grad_norm": 3.5357930660247803, + "learning_rate": 9.982764999399607e-06, + "loss": 1.3316, + "step": 739 + }, + { + "epoch": 0.3760879232577346, + "grad_norm": 3.308767080307007, + "learning_rate": 9.982625243283566e-06, + "loss": 1.4096, + "step": 740 + }, + { + "epoch": 0.37659615018105586, + "grad_norm": 3.031561851501465, + "learning_rate": 9.982484923808484e-06, + "loss": 1.3236, + "step": 741 + }, + { + "epoch": 0.3771043771043771, + "grad_norm": 3.082707643508911, + "learning_rate": 9.982344040990226e-06, + "loss": 1.3657, + "step": 742 + }, + { + "epoch": 0.37761260402769836, + "grad_norm": 2.883720636367798, + "learning_rate": 9.982202594844723e-06, + "loss": 1.1881, + "step": 743 + }, + { + "epoch": 0.3781208309510196, + "grad_norm": 3.01926851272583, + "learning_rate": 9.982060585387968e-06, + "loss": 1.3477, + "step": 744 + }, + { + "epoch": 0.37862905787434087, + "grad_norm": 2.99509596824646, + "learning_rate": 9.981918012636015e-06, + "loss": 1.2324, + "step": 745 + }, + { + "epoch": 0.3791372847976622, + "grad_norm": 3.1339457035064697, + "learning_rate": 9.981774876604985e-06, + "loss": 1.2635, + "step": 746 + }, + { + "epoch": 0.37964551172098343, + "grad_norm": 3.1058597564697266, + "learning_rate": 9.981631177311061e-06, + "loss": 1.3046, + "step": 747 + }, + { + "epoch": 0.3801537386443047, + "grad_norm": 3.1269471645355225, + "learning_rate": 9.981486914770493e-06, + "loss": 1.2447, + "step": 748 + }, + { + "epoch": 0.38066196556762594, + "grad_norm": 3.224168539047241, + "learning_rate": 9.981342088999588e-06, + "loss": 1.2274, + "step": 749 + }, + { + "epoch": 0.3811701924909472, + "grad_norm": 3.2049806118011475, + "learning_rate": 9.981196700014724e-06, + "loss": 1.2978, + "step": 750 + }, + { + "epoch": 0.38167841941426844, + "grad_norm": 3.1496620178222656, + "learning_rate": 9.981050747832336e-06, + "loss": 1.273, + "step": 751 + }, + { + "epoch": 0.38218664633758975, + "grad_norm": 3.535106897354126, + "learning_rate": 9.98090423246893e-06, + "loss": 1.3022, + "step": 752 + }, + { + "epoch": 0.382694873260911, + "grad_norm": 3.1526551246643066, + "learning_rate": 9.980757153941069e-06, + "loss": 1.1942, + "step": 753 + }, + { + "epoch": 0.38320310018423226, + "grad_norm": 3.3968474864959717, + "learning_rate": 9.980609512265383e-06, + "loss": 1.3029, + "step": 754 + }, + { + "epoch": 0.3837113271075535, + "grad_norm": 3.6863186359405518, + "learning_rate": 9.980461307458564e-06, + "loss": 1.3164, + "step": 755 + }, + { + "epoch": 0.38421955403087477, + "grad_norm": 2.9728426933288574, + "learning_rate": 9.980312539537373e-06, + "loss": 1.2588, + "step": 756 + }, + { + "epoch": 0.3847277809541961, + "grad_norm": 3.1564176082611084, + "learning_rate": 9.980163208518626e-06, + "loss": 1.3021, + "step": 757 + }, + { + "epoch": 0.38523600787751733, + "grad_norm": 3.3139936923980713, + "learning_rate": 9.980013314419208e-06, + "loss": 1.2729, + "step": 758 + }, + { + "epoch": 0.3857442348008386, + "grad_norm": 3.0863771438598633, + "learning_rate": 9.979862857256066e-06, + "loss": 1.3166, + "step": 759 + }, + { + "epoch": 0.38625246172415983, + "grad_norm": 3.377894639968872, + "learning_rate": 9.979711837046212e-06, + "loss": 1.3912, + "step": 760 + }, + { + "epoch": 0.3867606886474811, + "grad_norm": 3.1915252208709717, + "learning_rate": 9.979560253806723e-06, + "loss": 1.3662, + "step": 761 + }, + { + "epoch": 0.38726891557080234, + "grad_norm": 3.0366125106811523, + "learning_rate": 9.979408107554738e-06, + "loss": 1.231, + "step": 762 + }, + { + "epoch": 0.38777714249412365, + "grad_norm": 3.1416783332824707, + "learning_rate": 9.979255398307457e-06, + "loss": 1.2466, + "step": 763 + }, + { + "epoch": 0.3882853694174449, + "grad_norm": 2.884857416152954, + "learning_rate": 9.979102126082145e-06, + "loss": 1.2442, + "step": 764 + }, + { + "epoch": 0.38879359634076616, + "grad_norm": 3.1883974075317383, + "learning_rate": 9.978948290896134e-06, + "loss": 1.3042, + "step": 765 + }, + { + "epoch": 0.3893018232640874, + "grad_norm": 3.1092233657836914, + "learning_rate": 9.978793892766817e-06, + "loss": 1.3102, + "step": 766 + }, + { + "epoch": 0.38981005018740866, + "grad_norm": 3.001688241958618, + "learning_rate": 9.978638931711651e-06, + "loss": 1.3254, + "step": 767 + }, + { + "epoch": 0.3903182771107299, + "grad_norm": 3.205700635910034, + "learning_rate": 9.978483407748154e-06, + "loss": 1.3245, + "step": 768 + }, + { + "epoch": 0.3908265040340512, + "grad_norm": 3.2046477794647217, + "learning_rate": 9.978327320893915e-06, + "loss": 1.2614, + "step": 769 + }, + { + "epoch": 0.3913347309573725, + "grad_norm": 3.1941304206848145, + "learning_rate": 9.978170671166578e-06, + "loss": 1.353, + "step": 770 + }, + { + "epoch": 0.39184295788069373, + "grad_norm": 3.317028522491455, + "learning_rate": 9.978013458583857e-06, + "loss": 1.2896, + "step": 771 + }, + { + "epoch": 0.392351184804015, + "grad_norm": 3.0112125873565674, + "learning_rate": 9.977855683163526e-06, + "loss": 1.276, + "step": 772 + }, + { + "epoch": 0.39285941172733624, + "grad_norm": 3.0274596214294434, + "learning_rate": 9.977697344923425e-06, + "loss": 1.2585, + "step": 773 + }, + { + "epoch": 0.3933676386506575, + "grad_norm": 2.992523193359375, + "learning_rate": 9.977538443881454e-06, + "loss": 1.28, + "step": 774 + }, + { + "epoch": 0.3938758655739788, + "grad_norm": 3.1852054595947266, + "learning_rate": 9.97737898005558e-06, + "loss": 1.3497, + "step": 775 + }, + { + "epoch": 0.39438409249730005, + "grad_norm": 3.218014717102051, + "learning_rate": 9.977218953463836e-06, + "loss": 1.2833, + "step": 776 + }, + { + "epoch": 0.3948923194206213, + "grad_norm": 2.910120725631714, + "learning_rate": 9.97705836412431e-06, + "loss": 1.2687, + "step": 777 + }, + { + "epoch": 0.39540054634394256, + "grad_norm": 3.407662868499756, + "learning_rate": 9.976897212055164e-06, + "loss": 1.3764, + "step": 778 + }, + { + "epoch": 0.3959087732672638, + "grad_norm": 3.326226234436035, + "learning_rate": 9.976735497274615e-06, + "loss": 1.3304, + "step": 779 + }, + { + "epoch": 0.3964170001905851, + "grad_norm": 2.9093177318573, + "learning_rate": 9.976573219800948e-06, + "loss": 1.277, + "step": 780 + }, + { + "epoch": 0.3969252271139064, + "grad_norm": 3.1852495670318604, + "learning_rate": 9.976410379652512e-06, + "loss": 1.3158, + "step": 781 + }, + { + "epoch": 0.39743345403722763, + "grad_norm": 3.149109125137329, + "learning_rate": 9.97624697684772e-06, + "loss": 1.2381, + "step": 782 + }, + { + "epoch": 0.3979416809605489, + "grad_norm": 3.0496628284454346, + "learning_rate": 9.976083011405042e-06, + "loss": 1.2591, + "step": 783 + }, + { + "epoch": 0.39844990788387014, + "grad_norm": 2.9263885021209717, + "learning_rate": 9.975918483343022e-06, + "loss": 1.2457, + "step": 784 + }, + { + "epoch": 0.3989581348071914, + "grad_norm": 2.949040412902832, + "learning_rate": 9.975753392680258e-06, + "loss": 1.2433, + "step": 785 + }, + { + "epoch": 0.3994663617305127, + "grad_norm": 3.1974003314971924, + "learning_rate": 9.975587739435418e-06, + "loss": 1.2861, + "step": 786 + }, + { + "epoch": 0.39997458865383395, + "grad_norm": 3.366123914718628, + "learning_rate": 9.975421523627232e-06, + "loss": 1.2619, + "step": 787 + }, + { + "epoch": 0.4004828155771552, + "grad_norm": 3.0037221908569336, + "learning_rate": 9.975254745274492e-06, + "loss": 1.3039, + "step": 788 + }, + { + "epoch": 0.40099104250047646, + "grad_norm": 3.247976303100586, + "learning_rate": 9.975087404396057e-06, + "loss": 1.3495, + "step": 789 + }, + { + "epoch": 0.4014992694237977, + "grad_norm": 2.977108955383301, + "learning_rate": 9.974919501010844e-06, + "loss": 1.1731, + "step": 790 + }, + { + "epoch": 0.40200749634711896, + "grad_norm": 3.743683099746704, + "learning_rate": 9.97475103513784e-06, + "loss": 1.4369, + "step": 791 + }, + { + "epoch": 0.4025157232704403, + "grad_norm": 3.533647298812866, + "learning_rate": 9.97458200679609e-06, + "loss": 1.3664, + "step": 792 + }, + { + "epoch": 0.4030239501937615, + "grad_norm": 3.04760479927063, + "learning_rate": 9.974412416004706e-06, + "loss": 1.1608, + "step": 793 + }, + { + "epoch": 0.4035321771170828, + "grad_norm": 3.0548715591430664, + "learning_rate": 9.974242262782865e-06, + "loss": 1.1694, + "step": 794 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 2.859910726547241, + "learning_rate": 9.974071547149801e-06, + "loss": 1.2936, + "step": 795 + }, + { + "epoch": 0.4045486309637253, + "grad_norm": 3.3869526386260986, + "learning_rate": 9.973900269124818e-06, + "loss": 1.4214, + "step": 796 + }, + { + "epoch": 0.40505685788704654, + "grad_norm": 3.380077600479126, + "learning_rate": 9.973728428727284e-06, + "loss": 1.3634, + "step": 797 + }, + { + "epoch": 0.40556508481036785, + "grad_norm": 3.0257716178894043, + "learning_rate": 9.973556025976625e-06, + "loss": 1.2793, + "step": 798 + }, + { + "epoch": 0.4060733117336891, + "grad_norm": 3.1302125453948975, + "learning_rate": 9.973383060892335e-06, + "loss": 1.3027, + "step": 799 + }, + { + "epoch": 0.40658153865701036, + "grad_norm": 3.309006690979004, + "learning_rate": 9.973209533493969e-06, + "loss": 1.2625, + "step": 800 + }, + { + "epoch": 0.4070897655803316, + "grad_norm": 3.024994373321533, + "learning_rate": 9.973035443801147e-06, + "loss": 1.2243, + "step": 801 + }, + { + "epoch": 0.40759799250365286, + "grad_norm": 3.1751198768615723, + "learning_rate": 9.972860791833555e-06, + "loss": 1.2211, + "step": 802 + }, + { + "epoch": 0.40810621942697417, + "grad_norm": 3.170717716217041, + "learning_rate": 9.972685577610936e-06, + "loss": 1.2553, + "step": 803 + }, + { + "epoch": 0.4086144463502954, + "grad_norm": 3.22538161277771, + "learning_rate": 9.972509801153102e-06, + "loss": 1.2277, + "step": 804 + }, + { + "epoch": 0.4091226732736167, + "grad_norm": 3.1638424396514893, + "learning_rate": 9.972333462479931e-06, + "loss": 1.2627, + "step": 805 + }, + { + "epoch": 0.40963090019693793, + "grad_norm": 2.9831383228302, + "learning_rate": 9.972156561611354e-06, + "loss": 1.2155, + "step": 806 + }, + { + "epoch": 0.4101391271202592, + "grad_norm": 3.119858980178833, + "learning_rate": 9.971979098567377e-06, + "loss": 1.198, + "step": 807 + }, + { + "epoch": 0.41064735404358044, + "grad_norm": 3.1125288009643555, + "learning_rate": 9.971801073368062e-06, + "loss": 1.2545, + "step": 808 + }, + { + "epoch": 0.41115558096690175, + "grad_norm": 3.114292621612549, + "learning_rate": 9.97162248603354e-06, + "loss": 1.2087, + "step": 809 + }, + { + "epoch": 0.411663807890223, + "grad_norm": 3.1119182109832764, + "learning_rate": 9.971443336584002e-06, + "loss": 1.2883, + "step": 810 + }, + { + "epoch": 0.41217203481354425, + "grad_norm": 3.3735485076904297, + "learning_rate": 9.971263625039702e-06, + "loss": 1.2603, + "step": 811 + }, + { + "epoch": 0.4126802617368655, + "grad_norm": 3.0008327960968018, + "learning_rate": 9.97108335142096e-06, + "loss": 1.3121, + "step": 812 + }, + { + "epoch": 0.41318848866018676, + "grad_norm": 3.1853764057159424, + "learning_rate": 9.97090251574816e-06, + "loss": 1.2841, + "step": 813 + }, + { + "epoch": 0.413696715583508, + "grad_norm": 3.3970205783843994, + "learning_rate": 9.970721118041746e-06, + "loss": 1.3358, + "step": 814 + }, + { + "epoch": 0.4142049425068293, + "grad_norm": 3.416800022125244, + "learning_rate": 9.970539158322229e-06, + "loss": 1.3436, + "step": 815 + }, + { + "epoch": 0.4147131694301506, + "grad_norm": 2.908444404602051, + "learning_rate": 9.970356636610181e-06, + "loss": 1.3395, + "step": 816 + }, + { + "epoch": 0.4152213963534718, + "grad_norm": 3.0709686279296875, + "learning_rate": 9.97017355292624e-06, + "loss": 1.298, + "step": 817 + }, + { + "epoch": 0.4157296232767931, + "grad_norm": 3.745266914367676, + "learning_rate": 9.969989907291106e-06, + "loss": 1.2785, + "step": 818 + }, + { + "epoch": 0.41623785020011433, + "grad_norm": 2.99845290184021, + "learning_rate": 9.969805699725542e-06, + "loss": 1.2763, + "step": 819 + }, + { + "epoch": 0.4167460771234356, + "grad_norm": 3.5009357929229736, + "learning_rate": 9.969620930250377e-06, + "loss": 1.4035, + "step": 820 + }, + { + "epoch": 0.4172543040467569, + "grad_norm": 3.1333866119384766, + "learning_rate": 9.9694355988865e-06, + "loss": 1.2492, + "step": 821 + }, + { + "epoch": 0.41776253097007815, + "grad_norm": 3.015458583831787, + "learning_rate": 9.969249705654866e-06, + "loss": 1.3015, + "step": 822 + }, + { + "epoch": 0.4182707578933994, + "grad_norm": 2.9285178184509277, + "learning_rate": 9.969063250576494e-06, + "loss": 1.2905, + "step": 823 + }, + { + "epoch": 0.41877898481672066, + "grad_norm": 3.2691152095794678, + "learning_rate": 9.968876233672466e-06, + "loss": 1.2708, + "step": 824 + }, + { + "epoch": 0.4192872117400419, + "grad_norm": 3.1857168674468994, + "learning_rate": 9.968688654963926e-06, + "loss": 1.2818, + "step": 825 + }, + { + "epoch": 0.4197954386633632, + "grad_norm": 3.2709298133850098, + "learning_rate": 9.96850051447208e-06, + "loss": 1.2403, + "step": 826 + }, + { + "epoch": 0.42030366558668447, + "grad_norm": 3.037520170211792, + "learning_rate": 9.968311812218203e-06, + "loss": 1.2857, + "step": 827 + }, + { + "epoch": 0.4208118925100057, + "grad_norm": 3.4567365646362305, + "learning_rate": 9.96812254822363e-06, + "loss": 1.3809, + "step": 828 + }, + { + "epoch": 0.421320119433327, + "grad_norm": 3.0860140323638916, + "learning_rate": 9.967932722509762e-06, + "loss": 1.3025, + "step": 829 + }, + { + "epoch": 0.42182834635664823, + "grad_norm": 3.1566691398620605, + "learning_rate": 9.967742335098058e-06, + "loss": 1.3849, + "step": 830 + }, + { + "epoch": 0.4223365732799695, + "grad_norm": 3.086601734161377, + "learning_rate": 9.967551386010046e-06, + "loss": 1.335, + "step": 831 + }, + { + "epoch": 0.4228448002032908, + "grad_norm": 3.1381146907806396, + "learning_rate": 9.967359875267315e-06, + "loss": 1.1581, + "step": 832 + }, + { + "epoch": 0.42335302712661205, + "grad_norm": 3.1009199619293213, + "learning_rate": 9.967167802891519e-06, + "loss": 1.2917, + "step": 833 + }, + { + "epoch": 0.4238612540499333, + "grad_norm": 3.1351935863494873, + "learning_rate": 9.966975168904373e-06, + "loss": 1.3964, + "step": 834 + }, + { + "epoch": 0.42436948097325455, + "grad_norm": 2.7338829040527344, + "learning_rate": 9.966781973327661e-06, + "loss": 1.239, + "step": 835 + }, + { + "epoch": 0.4248777078965758, + "grad_norm": 3.2059786319732666, + "learning_rate": 9.966588216183221e-06, + "loss": 1.1639, + "step": 836 + }, + { + "epoch": 0.42538593481989706, + "grad_norm": 3.4231512546539307, + "learning_rate": 9.966393897492962e-06, + "loss": 1.319, + "step": 837 + }, + { + "epoch": 0.42589416174321837, + "grad_norm": 3.154146909713745, + "learning_rate": 9.966199017278859e-06, + "loss": 1.1938, + "step": 838 + }, + { + "epoch": 0.4264023886665396, + "grad_norm": 3.007706642150879, + "learning_rate": 9.96600357556294e-06, + "loss": 1.3219, + "step": 839 + }, + { + "epoch": 0.4269106155898609, + "grad_norm": 3.235159397125244, + "learning_rate": 9.965807572367306e-06, + "loss": 1.3359, + "step": 840 + }, + { + "epoch": 0.42741884251318213, + "grad_norm": 3.1410484313964844, + "learning_rate": 9.965611007714117e-06, + "loss": 1.3004, + "step": 841 + }, + { + "epoch": 0.4279270694365034, + "grad_norm": 3.1803131103515625, + "learning_rate": 9.965413881625597e-06, + "loss": 1.2798, + "step": 842 + }, + { + "epoch": 0.42843529635982464, + "grad_norm": 2.8185393810272217, + "learning_rate": 9.965216194124035e-06, + "loss": 1.2421, + "step": 843 + }, + { + "epoch": 0.42894352328314594, + "grad_norm": 3.0903513431549072, + "learning_rate": 9.965017945231783e-06, + "loss": 1.3236, + "step": 844 + }, + { + "epoch": 0.4294517502064672, + "grad_norm": 3.6765925884246826, + "learning_rate": 9.964819134971255e-06, + "loss": 1.3905, + "step": 845 + }, + { + "epoch": 0.42995997712978845, + "grad_norm": 3.137418031692505, + "learning_rate": 9.964619763364928e-06, + "loss": 1.173, + "step": 846 + }, + { + "epoch": 0.4304682040531097, + "grad_norm": 2.982210159301758, + "learning_rate": 9.964419830435346e-06, + "loss": 1.2189, + "step": 847 + }, + { + "epoch": 0.43097643097643096, + "grad_norm": 3.154118776321411, + "learning_rate": 9.964219336205114e-06, + "loss": 1.2155, + "step": 848 + }, + { + "epoch": 0.43148465789975227, + "grad_norm": 3.562628984451294, + "learning_rate": 9.9640182806969e-06, + "loss": 1.3424, + "step": 849 + }, + { + "epoch": 0.4319928848230735, + "grad_norm": 3.1238515377044678, + "learning_rate": 9.963816663933438e-06, + "loss": 1.3475, + "step": 850 + }, + { + "epoch": 0.4325011117463948, + "grad_norm": 3.4061684608459473, + "learning_rate": 9.963614485937522e-06, + "loss": 1.3098, + "step": 851 + }, + { + "epoch": 0.433009338669716, + "grad_norm": 2.9898059368133545, + "learning_rate": 9.963411746732012e-06, + "loss": 1.2531, + "step": 852 + }, + { + "epoch": 0.4335175655930373, + "grad_norm": 2.9392600059509277, + "learning_rate": 9.963208446339829e-06, + "loss": 1.2618, + "step": 853 + }, + { + "epoch": 0.43402579251635853, + "grad_norm": 3.1422648429870605, + "learning_rate": 9.963004584783961e-06, + "loss": 1.3015, + "step": 854 + }, + { + "epoch": 0.43453401943967984, + "grad_norm": 3.061648368835449, + "learning_rate": 9.962800162087458e-06, + "loss": 1.2793, + "step": 855 + }, + { + "epoch": 0.4350422463630011, + "grad_norm": 3.354825496673584, + "learning_rate": 9.962595178273432e-06, + "loss": 1.2846, + "step": 856 + }, + { + "epoch": 0.43555047328632235, + "grad_norm": 3.2007317543029785, + "learning_rate": 9.962389633365059e-06, + "loss": 1.246, + "step": 857 + }, + { + "epoch": 0.4360587002096436, + "grad_norm": 3.1026949882507324, + "learning_rate": 9.96218352738558e-06, + "loss": 1.2304, + "step": 858 + }, + { + "epoch": 0.43656692713296485, + "grad_norm": 3.2969212532043457, + "learning_rate": 9.961976860358298e-06, + "loss": 1.1946, + "step": 859 + }, + { + "epoch": 0.4370751540562861, + "grad_norm": 3.416917324066162, + "learning_rate": 9.961769632306579e-06, + "loss": 1.2282, + "step": 860 + }, + { + "epoch": 0.4375833809796074, + "grad_norm": 3.0532281398773193, + "learning_rate": 9.961561843253853e-06, + "loss": 1.2293, + "step": 861 + }, + { + "epoch": 0.43809160790292867, + "grad_norm": 3.875426769256592, + "learning_rate": 9.961353493223613e-06, + "loss": 1.3623, + "step": 862 + }, + { + "epoch": 0.4385998348262499, + "grad_norm": 3.1366961002349854, + "learning_rate": 9.961144582239418e-06, + "loss": 1.1868, + "step": 863 + }, + { + "epoch": 0.4391080617495712, + "grad_norm": 3.866417646408081, + "learning_rate": 9.96093511032489e-06, + "loss": 1.3833, + "step": 864 + }, + { + "epoch": 0.43961628867289243, + "grad_norm": 3.051649808883667, + "learning_rate": 9.96072507750371e-06, + "loss": 1.2557, + "step": 865 + }, + { + "epoch": 0.4401245155962137, + "grad_norm": 3.038184881210327, + "learning_rate": 9.960514483799624e-06, + "loss": 1.267, + "step": 866 + }, + { + "epoch": 0.440632742519535, + "grad_norm": 3.4575061798095703, + "learning_rate": 9.960303329236447e-06, + "loss": 1.4039, + "step": 867 + }, + { + "epoch": 0.44114096944285625, + "grad_norm": 3.2219109535217285, + "learning_rate": 9.960091613838048e-06, + "loss": 1.3335, + "step": 868 + }, + { + "epoch": 0.4416491963661775, + "grad_norm": 3.134032964706421, + "learning_rate": 9.959879337628368e-06, + "loss": 1.3197, + "step": 869 + }, + { + "epoch": 0.44215742328949875, + "grad_norm": 3.1833622455596924, + "learning_rate": 9.95966650063141e-06, + "loss": 1.2531, + "step": 870 + }, + { + "epoch": 0.44266565021282, + "grad_norm": 2.999913215637207, + "learning_rate": 9.959453102871231e-06, + "loss": 1.1841, + "step": 871 + }, + { + "epoch": 0.4431738771361413, + "grad_norm": 3.2226994037628174, + "learning_rate": 9.959239144371966e-06, + "loss": 1.302, + "step": 872 + }, + { + "epoch": 0.44368210405946257, + "grad_norm": 3.1408486366271973, + "learning_rate": 9.959024625157804e-06, + "loss": 1.2729, + "step": 873 + }, + { + "epoch": 0.4441903309827838, + "grad_norm": 3.2160913944244385, + "learning_rate": 9.958809545252997e-06, + "loss": 1.266, + "step": 874 + }, + { + "epoch": 0.4446985579061051, + "grad_norm": 3.3626604080200195, + "learning_rate": 9.958593904681866e-06, + "loss": 1.3973, + "step": 875 + }, + { + "epoch": 0.4452067848294263, + "grad_norm": 3.3469786643981934, + "learning_rate": 9.958377703468792e-06, + "loss": 1.282, + "step": 876 + }, + { + "epoch": 0.4457150117527476, + "grad_norm": 3.2448103427886963, + "learning_rate": 9.95816094163822e-06, + "loss": 1.2757, + "step": 877 + }, + { + "epoch": 0.4462232386760689, + "grad_norm": 4.24213171005249, + "learning_rate": 9.957943619214653e-06, + "loss": 1.3377, + "step": 878 + }, + { + "epoch": 0.44673146559939014, + "grad_norm": 3.2333717346191406, + "learning_rate": 9.95772573622267e-06, + "loss": 1.3042, + "step": 879 + }, + { + "epoch": 0.4472396925227114, + "grad_norm": 3.0316765308380127, + "learning_rate": 9.957507292686902e-06, + "loss": 1.3528, + "step": 880 + }, + { + "epoch": 0.44774791944603265, + "grad_norm": 2.985063314437866, + "learning_rate": 9.957288288632048e-06, + "loss": 1.2457, + "step": 881 + }, + { + "epoch": 0.4482561463693539, + "grad_norm": 2.8933520317077637, + "learning_rate": 9.957068724082868e-06, + "loss": 1.2641, + "step": 882 + }, + { + "epoch": 0.44876437329267516, + "grad_norm": 3.3127031326293945, + "learning_rate": 9.95684859906419e-06, + "loss": 1.3061, + "step": 883 + }, + { + "epoch": 0.44927260021599646, + "grad_norm": 3.223618984222412, + "learning_rate": 9.9566279136009e-06, + "loss": 1.3157, + "step": 884 + }, + { + "epoch": 0.4497808271393177, + "grad_norm": 2.9213273525238037, + "learning_rate": 9.956406667717951e-06, + "loss": 1.307, + "step": 885 + }, + { + "epoch": 0.45028905406263897, + "grad_norm": 3.295760154724121, + "learning_rate": 9.956184861440357e-06, + "loss": 1.1735, + "step": 886 + }, + { + "epoch": 0.4507972809859602, + "grad_norm": 3.401263952255249, + "learning_rate": 9.955962494793197e-06, + "loss": 1.3738, + "step": 887 + }, + { + "epoch": 0.4513055079092815, + "grad_norm": 2.9773504734039307, + "learning_rate": 9.955739567801613e-06, + "loss": 1.229, + "step": 888 + }, + { + "epoch": 0.45181373483260273, + "grad_norm": 6.719383239746094, + "learning_rate": 9.95551608049081e-06, + "loss": 1.4442, + "step": 889 + }, + { + "epoch": 0.45232196175592404, + "grad_norm": 3.0398476123809814, + "learning_rate": 9.955292032886057e-06, + "loss": 1.2627, + "step": 890 + }, + { + "epoch": 0.4528301886792453, + "grad_norm": 2.933922290802002, + "learning_rate": 9.955067425012685e-06, + "loss": 1.2333, + "step": 891 + }, + { + "epoch": 0.45333841560256655, + "grad_norm": 3.1984505653381348, + "learning_rate": 9.95484225689609e-06, + "loss": 1.3416, + "step": 892 + }, + { + "epoch": 0.4538466425258878, + "grad_norm": 3.189798593521118, + "learning_rate": 9.95461652856173e-06, + "loss": 1.2928, + "step": 893 + }, + { + "epoch": 0.45435486944920905, + "grad_norm": 3.028228759765625, + "learning_rate": 9.954390240035127e-06, + "loss": 1.2474, + "step": 894 + }, + { + "epoch": 0.45486309637253036, + "grad_norm": 3.0100460052490234, + "learning_rate": 9.954163391341867e-06, + "loss": 1.2952, + "step": 895 + }, + { + "epoch": 0.4553713232958516, + "grad_norm": 3.1047329902648926, + "learning_rate": 9.953935982507597e-06, + "loss": 1.2254, + "step": 896 + }, + { + "epoch": 0.45587955021917287, + "grad_norm": 3.1082210540771484, + "learning_rate": 9.95370801355803e-06, + "loss": 1.1121, + "step": 897 + }, + { + "epoch": 0.4563877771424941, + "grad_norm": 3.420098304748535, + "learning_rate": 9.953479484518943e-06, + "loss": 1.221, + "step": 898 + }, + { + "epoch": 0.4568960040658154, + "grad_norm": 3.4203615188598633, + "learning_rate": 9.953250395416172e-06, + "loss": 1.2991, + "step": 899 + }, + { + "epoch": 0.45740423098913663, + "grad_norm": 3.020646572113037, + "learning_rate": 9.953020746275618e-06, + "loss": 1.2723, + "step": 900 + }, + { + "epoch": 0.45791245791245794, + "grad_norm": 3.2635576725006104, + "learning_rate": 9.95279053712325e-06, + "loss": 1.3714, + "step": 901 + }, + { + "epoch": 0.4584206848357792, + "grad_norm": 2.987079381942749, + "learning_rate": 9.952559767985093e-06, + "loss": 1.2517, + "step": 902 + }, + { + "epoch": 0.45892891175910044, + "grad_norm": 2.9069972038269043, + "learning_rate": 9.95232843888724e-06, + "loss": 1.2647, + "step": 903 + }, + { + "epoch": 0.4594371386824217, + "grad_norm": 3.121272087097168, + "learning_rate": 9.952096549855846e-06, + "loss": 1.3379, + "step": 904 + }, + { + "epoch": 0.45994536560574295, + "grad_norm": 2.9536068439483643, + "learning_rate": 9.95186410091713e-06, + "loss": 1.2483, + "step": 905 + }, + { + "epoch": 0.4604535925290642, + "grad_norm": 3.0364537239074707, + "learning_rate": 9.951631092097373e-06, + "loss": 1.2642, + "step": 906 + }, + { + "epoch": 0.4609618194523855, + "grad_norm": 3.0341713428497314, + "learning_rate": 9.951397523422923e-06, + "loss": 1.3138, + "step": 907 + }, + { + "epoch": 0.46147004637570677, + "grad_norm": 3.261298656463623, + "learning_rate": 9.951163394920185e-06, + "loss": 1.286, + "step": 908 + }, + { + "epoch": 0.461978273299028, + "grad_norm": 3.1730971336364746, + "learning_rate": 9.95092870661563e-06, + "loss": 1.2841, + "step": 909 + }, + { + "epoch": 0.4624865002223493, + "grad_norm": 3.372532606124878, + "learning_rate": 9.950693458535796e-06, + "loss": 1.3713, + "step": 910 + }, + { + "epoch": 0.4629947271456705, + "grad_norm": 3.6603589057922363, + "learning_rate": 9.950457650707281e-06, + "loss": 1.3572, + "step": 911 + }, + { + "epoch": 0.4635029540689918, + "grad_norm": 3.153555154800415, + "learning_rate": 9.950221283156744e-06, + "loss": 1.3132, + "step": 912 + }, + { + "epoch": 0.4640111809923131, + "grad_norm": 2.9425718784332275, + "learning_rate": 9.94998435591091e-06, + "loss": 1.1842, + "step": 913 + }, + { + "epoch": 0.46451940791563434, + "grad_norm": 3.12605357170105, + "learning_rate": 9.94974686899657e-06, + "loss": 1.3627, + "step": 914 + }, + { + "epoch": 0.4650276348389556, + "grad_norm": 3.0458600521087646, + "learning_rate": 9.949508822440574e-06, + "loss": 1.2577, + "step": 915 + }, + { + "epoch": 0.46553586176227685, + "grad_norm": 3.2679193019866943, + "learning_rate": 9.949270216269837e-06, + "loss": 1.2647, + "step": 916 + }, + { + "epoch": 0.4660440886855981, + "grad_norm": 3.032907724380493, + "learning_rate": 9.949031050511335e-06, + "loss": 1.2442, + "step": 917 + }, + { + "epoch": 0.4665523156089194, + "grad_norm": 3.104398727416992, + "learning_rate": 9.94879132519211e-06, + "loss": 1.3335, + "step": 918 + }, + { + "epoch": 0.46706054253224066, + "grad_norm": 3.429504632949829, + "learning_rate": 9.948551040339269e-06, + "loss": 1.3438, + "step": 919 + }, + { + "epoch": 0.4675687694555619, + "grad_norm": 3.1915969848632812, + "learning_rate": 9.948310195979976e-06, + "loss": 1.2604, + "step": 920 + }, + { + "epoch": 0.46807699637888317, + "grad_norm": 3.0310678482055664, + "learning_rate": 9.948068792141465e-06, + "loss": 1.253, + "step": 921 + }, + { + "epoch": 0.4685852233022044, + "grad_norm": 3.172191858291626, + "learning_rate": 9.947826828851029e-06, + "loss": 1.2546, + "step": 922 + }, + { + "epoch": 0.4690934502255257, + "grad_norm": 3.4849483966827393, + "learning_rate": 9.947584306136024e-06, + "loss": 1.2744, + "step": 923 + }, + { + "epoch": 0.469601677148847, + "grad_norm": 3.4134442806243896, + "learning_rate": 9.947341224023875e-06, + "loss": 1.4603, + "step": 924 + }, + { + "epoch": 0.47010990407216824, + "grad_norm": 3.0923573970794678, + "learning_rate": 9.94709758254206e-06, + "loss": 1.3375, + "step": 925 + }, + { + "epoch": 0.4706181309954895, + "grad_norm": 3.329230546951294, + "learning_rate": 9.946853381718133e-06, + "loss": 1.1899, + "step": 926 + }, + { + "epoch": 0.47112635791881075, + "grad_norm": 2.9873125553131104, + "learning_rate": 9.946608621579698e-06, + "loss": 1.3432, + "step": 927 + }, + { + "epoch": 0.471634584842132, + "grad_norm": 3.530097723007202, + "learning_rate": 9.946363302154434e-06, + "loss": 1.1975, + "step": 928 + }, + { + "epoch": 0.47214281176545325, + "grad_norm": 3.5325372219085693, + "learning_rate": 9.946117423470074e-06, + "loss": 1.2736, + "step": 929 + }, + { + "epoch": 0.47265103868877456, + "grad_norm": 3.143618106842041, + "learning_rate": 9.94587098555442e-06, + "loss": 1.3366, + "step": 930 + }, + { + "epoch": 0.4731592656120958, + "grad_norm": 3.117429256439209, + "learning_rate": 9.945623988435336e-06, + "loss": 1.3636, + "step": 931 + }, + { + "epoch": 0.47366749253541707, + "grad_norm": 3.4205844402313232, + "learning_rate": 9.94537643214075e-06, + "loss": 1.3578, + "step": 932 + }, + { + "epoch": 0.4741757194587383, + "grad_norm": 3.8048481941223145, + "learning_rate": 9.945128316698647e-06, + "loss": 1.4087, + "step": 933 + }, + { + "epoch": 0.4746839463820596, + "grad_norm": 4.365840435028076, + "learning_rate": 9.944879642137085e-06, + "loss": 1.1789, + "step": 934 + }, + { + "epoch": 0.4751921733053808, + "grad_norm": 3.3367462158203125, + "learning_rate": 9.944630408484177e-06, + "loss": 1.2769, + "step": 935 + }, + { + "epoch": 0.47570040022870214, + "grad_norm": 3.1642816066741943, + "learning_rate": 9.944380615768104e-06, + "loss": 1.3854, + "step": 936 + }, + { + "epoch": 0.4762086271520234, + "grad_norm": 3.0635826587677, + "learning_rate": 9.944130264017109e-06, + "loss": 1.2968, + "step": 937 + }, + { + "epoch": 0.47671685407534464, + "grad_norm": 3.5414836406707764, + "learning_rate": 9.943879353259496e-06, + "loss": 1.2829, + "step": 938 + }, + { + "epoch": 0.4772250809986659, + "grad_norm": 2.936600923538208, + "learning_rate": 9.943627883523638e-06, + "loss": 1.2875, + "step": 939 + }, + { + "epoch": 0.47773330792198715, + "grad_norm": 3.4069905281066895, + "learning_rate": 9.943375854837963e-06, + "loss": 1.3088, + "step": 940 + }, + { + "epoch": 0.4782415348453084, + "grad_norm": 2.994814872741699, + "learning_rate": 9.94312326723097e-06, + "loss": 1.25, + "step": 941 + }, + { + "epoch": 0.4787497617686297, + "grad_norm": 3.145922899246216, + "learning_rate": 9.942870120731217e-06, + "loss": 1.1929, + "step": 942 + }, + { + "epoch": 0.47925798869195096, + "grad_norm": 2.976090908050537, + "learning_rate": 9.942616415367323e-06, + "loss": 1.2835, + "step": 943 + }, + { + "epoch": 0.4797662156152722, + "grad_norm": 3.158318281173706, + "learning_rate": 9.942362151167977e-06, + "loss": 1.3596, + "step": 944 + }, + { + "epoch": 0.48027444253859347, + "grad_norm": 3.21836519241333, + "learning_rate": 9.942107328161926e-06, + "loss": 1.3446, + "step": 945 + }, + { + "epoch": 0.4807826694619147, + "grad_norm": 2.979194402694702, + "learning_rate": 9.941851946377979e-06, + "loss": 1.2835, + "step": 946 + }, + { + "epoch": 0.48129089638523603, + "grad_norm": 3.823063850402832, + "learning_rate": 9.941596005845014e-06, + "loss": 1.2849, + "step": 947 + }, + { + "epoch": 0.4817991233085573, + "grad_norm": 3.020623207092285, + "learning_rate": 9.941339506591968e-06, + "loss": 1.3398, + "step": 948 + }, + { + "epoch": 0.48230735023187854, + "grad_norm": 3.188835382461548, + "learning_rate": 9.941082448647842e-06, + "loss": 1.3944, + "step": 949 + }, + { + "epoch": 0.4828155771551998, + "grad_norm": 3.160069704055786, + "learning_rate": 9.9408248320417e-06, + "loss": 1.264, + "step": 950 + }, + { + "epoch": 0.48332380407852105, + "grad_norm": 2.99892258644104, + "learning_rate": 9.940566656802667e-06, + "loss": 1.2279, + "step": 951 + }, + { + "epoch": 0.4838320310018423, + "grad_norm": 3.09138560295105, + "learning_rate": 9.940307922959938e-06, + "loss": 1.2021, + "step": 952 + }, + { + "epoch": 0.4843402579251636, + "grad_norm": 2.999363660812378, + "learning_rate": 9.940048630542765e-06, + "loss": 1.2779, + "step": 953 + }, + { + "epoch": 0.48484848484848486, + "grad_norm": 3.062927484512329, + "learning_rate": 9.93978877958046e-06, + "loss": 1.2581, + "step": 954 + }, + { + "epoch": 0.4853567117718061, + "grad_norm": 3.0736305713653564, + "learning_rate": 9.939528370102412e-06, + "loss": 1.2768, + "step": 955 + }, + { + "epoch": 0.48586493869512737, + "grad_norm": 3.21579647064209, + "learning_rate": 9.939267402138058e-06, + "loss": 1.2204, + "step": 956 + }, + { + "epoch": 0.4863731656184486, + "grad_norm": 3.127753973007202, + "learning_rate": 9.939005875716904e-06, + "loss": 1.2109, + "step": 957 + }, + { + "epoch": 0.4868813925417699, + "grad_norm": 3.4368927478790283, + "learning_rate": 9.938743790868523e-06, + "loss": 1.3368, + "step": 958 + }, + { + "epoch": 0.4873896194650912, + "grad_norm": 3.072741985321045, + "learning_rate": 9.938481147622545e-06, + "loss": 1.2094, + "step": 959 + }, + { + "epoch": 0.48789784638841244, + "grad_norm": 3.4925167560577393, + "learning_rate": 9.938217946008665e-06, + "loss": 1.3443, + "step": 960 + }, + { + "epoch": 0.4884060733117337, + "grad_norm": 3.1357178688049316, + "learning_rate": 9.937954186056644e-06, + "loss": 1.2344, + "step": 961 + }, + { + "epoch": 0.48891430023505494, + "grad_norm": 2.8915724754333496, + "learning_rate": 9.937689867796303e-06, + "loss": 1.2941, + "step": 962 + }, + { + "epoch": 0.4894225271583762, + "grad_norm": 2.945512533187866, + "learning_rate": 9.937424991257526e-06, + "loss": 1.3199, + "step": 963 + }, + { + "epoch": 0.48993075408169745, + "grad_norm": 3.0827341079711914, + "learning_rate": 9.937159556470263e-06, + "loss": 1.2625, + "step": 964 + }, + { + "epoch": 0.49043898100501876, + "grad_norm": 2.878173828125, + "learning_rate": 9.936893563464525e-06, + "loss": 1.3022, + "step": 965 + }, + { + "epoch": 0.49094720792834, + "grad_norm": 2.975311040878296, + "learning_rate": 9.936627012270385e-06, + "loss": 1.2563, + "step": 966 + }, + { + "epoch": 0.49145543485166127, + "grad_norm": 3.058943510055542, + "learning_rate": 9.93635990291798e-06, + "loss": 1.2574, + "step": 967 + }, + { + "epoch": 0.4919636617749825, + "grad_norm": 3.2917304039001465, + "learning_rate": 9.936092235437515e-06, + "loss": 1.2649, + "step": 968 + }, + { + "epoch": 0.4924718886983038, + "grad_norm": 3.0306715965270996, + "learning_rate": 9.93582400985925e-06, + "loss": 1.2129, + "step": 969 + }, + { + "epoch": 0.4929801156216251, + "grad_norm": 3.389181137084961, + "learning_rate": 9.935555226213512e-06, + "loss": 1.2894, + "step": 970 + }, + { + "epoch": 0.49348834254494633, + "grad_norm": 2.8703081607818604, + "learning_rate": 9.935285884530693e-06, + "loss": 1.2568, + "step": 971 + }, + { + "epoch": 0.4939965694682676, + "grad_norm": 3.097668170928955, + "learning_rate": 9.935015984841244e-06, + "loss": 1.1949, + "step": 972 + }, + { + "epoch": 0.49450479639158884, + "grad_norm": 3.344644546508789, + "learning_rate": 9.93474552717568e-06, + "loss": 1.3539, + "step": 973 + }, + { + "epoch": 0.4950130233149101, + "grad_norm": 2.9466795921325684, + "learning_rate": 9.934474511564583e-06, + "loss": 1.2893, + "step": 974 + }, + { + "epoch": 0.49552125023823135, + "grad_norm": 3.2382895946502686, + "learning_rate": 9.934202938038595e-06, + "loss": 1.1904, + "step": 975 + }, + { + "epoch": 0.49602947716155266, + "grad_norm": 3.703711986541748, + "learning_rate": 9.93393080662842e-06, + "loss": 1.3855, + "step": 976 + }, + { + "epoch": 0.4965377040848739, + "grad_norm": 2.887328863143921, + "learning_rate": 9.933658117364829e-06, + "loss": 1.1818, + "step": 977 + }, + { + "epoch": 0.49704593100819516, + "grad_norm": 3.141327381134033, + "learning_rate": 9.93338487027865e-06, + "loss": 1.3616, + "step": 978 + }, + { + "epoch": 0.4975541579315164, + "grad_norm": 3.216190814971924, + "learning_rate": 9.93311106540078e-06, + "loss": 1.3995, + "step": 979 + }, + { + "epoch": 0.49806238485483767, + "grad_norm": 2.990403175354004, + "learning_rate": 9.932836702762173e-06, + "loss": 1.1847, + "step": 980 + }, + { + "epoch": 0.4985706117781589, + "grad_norm": 2.8127925395965576, + "learning_rate": 9.932561782393858e-06, + "loss": 1.1195, + "step": 981 + }, + { + "epoch": 0.49907883870148023, + "grad_norm": 3.067380428314209, + "learning_rate": 9.93228630432691e-06, + "loss": 1.3241, + "step": 982 + }, + { + "epoch": 0.4995870656248015, + "grad_norm": 3.2635014057159424, + "learning_rate": 9.932010268592479e-06, + "loss": 1.4408, + "step": 983 + }, + { + "epoch": 0.5000952925481227, + "grad_norm": 3.01632022857666, + "learning_rate": 9.931733675221776e-06, + "loss": 1.3519, + "step": 984 + }, + { + "epoch": 0.500603519471444, + "grad_norm": 3.1168856620788574, + "learning_rate": 9.931456524246073e-06, + "loss": 1.2522, + "step": 985 + }, + { + "epoch": 0.5011117463947653, + "grad_norm": 3.0207486152648926, + "learning_rate": 9.931178815696706e-06, + "loss": 1.3152, + "step": 986 + }, + { + "epoch": 0.5016199733180865, + "grad_norm": 3.0515527725219727, + "learning_rate": 9.930900549605077e-06, + "loss": 1.2104, + "step": 987 + }, + { + "epoch": 0.5021282002414078, + "grad_norm": 2.985316514968872, + "learning_rate": 9.93062172600264e-06, + "loss": 1.2172, + "step": 988 + }, + { + "epoch": 0.502636427164729, + "grad_norm": 3.1258912086486816, + "learning_rate": 9.930342344920929e-06, + "loss": 1.2094, + "step": 989 + }, + { + "epoch": 0.5031446540880503, + "grad_norm": 3.497823476791382, + "learning_rate": 9.930062406391527e-06, + "loss": 1.2589, + "step": 990 + }, + { + "epoch": 0.5036528810113716, + "grad_norm": 2.91703462600708, + "learning_rate": 9.929781910446088e-06, + "loss": 1.2083, + "step": 991 + }, + { + "epoch": 0.5041611079346928, + "grad_norm": 2.9708058834075928, + "learning_rate": 9.929500857116326e-06, + "loss": 1.2771, + "step": 992 + }, + { + "epoch": 0.5046693348580141, + "grad_norm": 3.113933563232422, + "learning_rate": 9.929219246434014e-06, + "loss": 1.1901, + "step": 993 + }, + { + "epoch": 0.5051775617813353, + "grad_norm": 3.2545571327209473, + "learning_rate": 9.928937078430996e-06, + "loss": 1.4007, + "step": 994 + }, + { + "epoch": 0.5056857887046566, + "grad_norm": 3.0928285121917725, + "learning_rate": 9.928654353139175e-06, + "loss": 1.2483, + "step": 995 + }, + { + "epoch": 0.506194015627978, + "grad_norm": 3.1192171573638916, + "learning_rate": 9.928371070590517e-06, + "loss": 1.2464, + "step": 996 + }, + { + "epoch": 0.5067022425512991, + "grad_norm": 3.0406901836395264, + "learning_rate": 9.928087230817053e-06, + "loss": 1.3043, + "step": 997 + }, + { + "epoch": 0.5072104694746205, + "grad_norm": 3.2588446140289307, + "learning_rate": 9.92780283385087e-06, + "loss": 1.2525, + "step": 998 + }, + { + "epoch": 0.5077186963979416, + "grad_norm": 3.1698226928710938, + "learning_rate": 9.927517879724127e-06, + "loss": 1.2424, + "step": 999 + }, + { + "epoch": 0.508226923321263, + "grad_norm": 3.1326828002929688, + "learning_rate": 9.927232368469044e-06, + "loss": 1.2272, + "step": 1000 + }, + { + "epoch": 0.508226923321263, + "eval_loss": 1.2929835319519043, + "eval_runtime": 12.5577, + "eval_samples_per_second": 31.853, + "eval_steps_per_second": 3.982, + "step": 1000 + }, + { + "epoch": 0.5087351502445842, + "grad_norm": 2.9654858112335205, + "learning_rate": 9.926946300117897e-06, + "loss": 1.2446, + "step": 1001 + }, + { + "epoch": 0.5092433771679055, + "grad_norm": 2.9097492694854736, + "learning_rate": 9.926659674703036e-06, + "loss": 1.3136, + "step": 1002 + }, + { + "epoch": 0.5097516040912268, + "grad_norm": 3.0150370597839355, + "learning_rate": 9.926372492256864e-06, + "loss": 1.356, + "step": 1003 + }, + { + "epoch": 0.510259831014548, + "grad_norm": 3.2294318675994873, + "learning_rate": 9.926084752811853e-06, + "loss": 1.276, + "step": 1004 + }, + { + "epoch": 0.5107680579378693, + "grad_norm": 2.965230703353882, + "learning_rate": 9.925796456400535e-06, + "loss": 1.2202, + "step": 1005 + }, + { + "epoch": 0.5112762848611905, + "grad_norm": 2.934131145477295, + "learning_rate": 9.92550760305551e-06, + "loss": 1.2714, + "step": 1006 + }, + { + "epoch": 0.5117845117845118, + "grad_norm": 3.065397262573242, + "learning_rate": 9.92521819280943e-06, + "loss": 1.2414, + "step": 1007 + }, + { + "epoch": 0.5122927387078331, + "grad_norm": 3.265735387802124, + "learning_rate": 9.924928225695026e-06, + "loss": 1.2842, + "step": 1008 + }, + { + "epoch": 0.5128009656311543, + "grad_norm": 3.2375340461730957, + "learning_rate": 9.924637701745075e-06, + "loss": 1.1905, + "step": 1009 + }, + { + "epoch": 0.5133091925544756, + "grad_norm": 3.048048257827759, + "learning_rate": 9.924346620992429e-06, + "loss": 1.3127, + "step": 1010 + }, + { + "epoch": 0.5138174194777968, + "grad_norm": 2.9338512420654297, + "learning_rate": 9.924054983469999e-06, + "loss": 1.173, + "step": 1011 + }, + { + "epoch": 0.5143256464011181, + "grad_norm": 2.960909366607666, + "learning_rate": 9.923762789210757e-06, + "loss": 1.2117, + "step": 1012 + }, + { + "epoch": 0.5148338733244394, + "grad_norm": 2.8854153156280518, + "learning_rate": 9.923470038247741e-06, + "loss": 1.1573, + "step": 1013 + }, + { + "epoch": 0.5153421002477606, + "grad_norm": 3.157883644104004, + "learning_rate": 9.923176730614052e-06, + "loss": 1.2489, + "step": 1014 + }, + { + "epoch": 0.5158503271710819, + "grad_norm": 3.11163067817688, + "learning_rate": 9.92288286634285e-06, + "loss": 1.3366, + "step": 1015 + }, + { + "epoch": 0.5163585540944031, + "grad_norm": 3.2269506454467773, + "learning_rate": 9.922588445467362e-06, + "loss": 1.41, + "step": 1016 + }, + { + "epoch": 0.5168667810177244, + "grad_norm": 3.2312417030334473, + "learning_rate": 9.92229346802088e-06, + "loss": 1.332, + "step": 1017 + }, + { + "epoch": 0.5173750079410456, + "grad_norm": 3.2907750606536865, + "learning_rate": 9.921997934036749e-06, + "loss": 1.2556, + "step": 1018 + }, + { + "epoch": 0.5178832348643669, + "grad_norm": 2.9131078720092773, + "learning_rate": 9.921701843548389e-06, + "loss": 1.3176, + "step": 1019 + }, + { + "epoch": 0.5183914617876882, + "grad_norm": 3.4000084400177, + "learning_rate": 9.921405196589273e-06, + "loss": 1.2849, + "step": 1020 + }, + { + "epoch": 0.5188996887110094, + "grad_norm": 3.0663211345672607, + "learning_rate": 9.921107993192946e-06, + "loss": 1.2214, + "step": 1021 + }, + { + "epoch": 0.5194079156343308, + "grad_norm": 2.9851553440093994, + "learning_rate": 9.920810233393007e-06, + "loss": 1.1617, + "step": 1022 + }, + { + "epoch": 0.519916142557652, + "grad_norm": 3.3432230949401855, + "learning_rate": 9.920511917223125e-06, + "loss": 1.2762, + "step": 1023 + }, + { + "epoch": 0.5204243694809733, + "grad_norm": 3.3022565841674805, + "learning_rate": 9.920213044717027e-06, + "loss": 1.3154, + "step": 1024 + }, + { + "epoch": 0.5209325964042946, + "grad_norm": 3.4665110111236572, + "learning_rate": 9.919913615908505e-06, + "loss": 1.2879, + "step": 1025 + }, + { + "epoch": 0.5214408233276158, + "grad_norm": 3.0947935581207275, + "learning_rate": 9.919613630831416e-06, + "loss": 1.2294, + "step": 1026 + }, + { + "epoch": 0.5219490502509371, + "grad_norm": 3.237161874771118, + "learning_rate": 9.919313089519677e-06, + "loss": 1.2859, + "step": 1027 + }, + { + "epoch": 0.5224572771742583, + "grad_norm": 3.29890775680542, + "learning_rate": 9.919011992007266e-06, + "loss": 1.2226, + "step": 1028 + }, + { + "epoch": 0.5229655040975796, + "grad_norm": 3.400012969970703, + "learning_rate": 9.91871033832823e-06, + "loss": 1.3052, + "step": 1029 + }, + { + "epoch": 0.5234737310209008, + "grad_norm": 3.583190679550171, + "learning_rate": 9.918408128516674e-06, + "loss": 1.3402, + "step": 1030 + }, + { + "epoch": 0.5239819579442221, + "grad_norm": 3.0629453659057617, + "learning_rate": 9.918105362606766e-06, + "loss": 1.258, + "step": 1031 + }, + { + "epoch": 0.5244901848675434, + "grad_norm": 3.27661395072937, + "learning_rate": 9.91780204063274e-06, + "loss": 1.4624, + "step": 1032 + }, + { + "epoch": 0.5249984117908646, + "grad_norm": 3.9633708000183105, + "learning_rate": 9.917498162628888e-06, + "loss": 1.2498, + "step": 1033 + }, + { + "epoch": 0.5255066387141859, + "grad_norm": 3.0484509468078613, + "learning_rate": 9.917193728629574e-06, + "loss": 1.2621, + "step": 1034 + }, + { + "epoch": 0.5260148656375071, + "grad_norm": 3.034428596496582, + "learning_rate": 9.916888738669212e-06, + "loss": 1.2793, + "step": 1035 + }, + { + "epoch": 0.5265230925608284, + "grad_norm": 3.1338136196136475, + "learning_rate": 9.91658319278229e-06, + "loss": 1.3162, + "step": 1036 + }, + { + "epoch": 0.5270313194841497, + "grad_norm": 3.1185007095336914, + "learning_rate": 9.916277091003352e-06, + "loss": 1.2203, + "step": 1037 + }, + { + "epoch": 0.5275395464074709, + "grad_norm": 3.052046060562134, + "learning_rate": 9.915970433367009e-06, + "loss": 1.2556, + "step": 1038 + }, + { + "epoch": 0.5280477733307922, + "grad_norm": 3.055419921875, + "learning_rate": 9.915663219907933e-06, + "loss": 1.2842, + "step": 1039 + }, + { + "epoch": 0.5285560002541134, + "grad_norm": 3.175314426422119, + "learning_rate": 9.915355450660858e-06, + "loss": 1.2761, + "step": 1040 + }, + { + "epoch": 0.5290642271774347, + "grad_norm": 2.6530027389526367, + "learning_rate": 9.915047125660581e-06, + "loss": 1.2134, + "step": 1041 + }, + { + "epoch": 0.529572454100756, + "grad_norm": 3.3357229232788086, + "learning_rate": 9.914738244941965e-06, + "loss": 1.3765, + "step": 1042 + }, + { + "epoch": 0.5300806810240772, + "grad_norm": 2.9852263927459717, + "learning_rate": 9.91442880853993e-06, + "loss": 1.2577, + "step": 1043 + }, + { + "epoch": 0.5305889079473985, + "grad_norm": 2.864121913909912, + "learning_rate": 9.914118816489469e-06, + "loss": 1.3375, + "step": 1044 + }, + { + "epoch": 0.5310971348707197, + "grad_norm": 2.9069125652313232, + "learning_rate": 9.913808268825625e-06, + "loss": 1.2162, + "step": 1045 + }, + { + "epoch": 0.531605361794041, + "grad_norm": 3.2001500129699707, + "learning_rate": 9.91349716558351e-06, + "loss": 1.2921, + "step": 1046 + }, + { + "epoch": 0.5321135887173623, + "grad_norm": 2.888265371322632, + "learning_rate": 9.913185506798302e-06, + "loss": 1.1466, + "step": 1047 + }, + { + "epoch": 0.5326218156406836, + "grad_norm": 3.1221208572387695, + "learning_rate": 9.912873292505238e-06, + "loss": 1.2126, + "step": 1048 + }, + { + "epoch": 0.5331300425640049, + "grad_norm": 3.3143773078918457, + "learning_rate": 9.912560522739618e-06, + "loss": 1.3249, + "step": 1049 + }, + { + "epoch": 0.5336382694873261, + "grad_norm": 3.1017792224884033, + "learning_rate": 9.912247197536804e-06, + "loss": 1.3083, + "step": 1050 + }, + { + "epoch": 0.5341464964106474, + "grad_norm": 2.9904158115386963, + "learning_rate": 9.911933316932223e-06, + "loss": 1.2244, + "step": 1051 + }, + { + "epoch": 0.5346547233339686, + "grad_norm": 3.5156807899475098, + "learning_rate": 9.911618880961365e-06, + "loss": 1.3113, + "step": 1052 + }, + { + "epoch": 0.5351629502572899, + "grad_norm": 3.0118355751037598, + "learning_rate": 9.91130388965978e-06, + "loss": 1.2591, + "step": 1053 + }, + { + "epoch": 0.5356711771806112, + "grad_norm": 3.93129301071167, + "learning_rate": 9.910988343063081e-06, + "loss": 1.3097, + "step": 1054 + }, + { + "epoch": 0.5361794041039324, + "grad_norm": 2.846911668777466, + "learning_rate": 9.910672241206948e-06, + "loss": 1.1875, + "step": 1055 + }, + { + "epoch": 0.5366876310272537, + "grad_norm": 2.836031913757324, + "learning_rate": 9.91035558412712e-06, + "loss": 1.302, + "step": 1056 + }, + { + "epoch": 0.5371958579505749, + "grad_norm": 3.446367025375366, + "learning_rate": 9.910038371859399e-06, + "loss": 1.327, + "step": 1057 + }, + { + "epoch": 0.5377040848738962, + "grad_norm": 2.8755125999450684, + "learning_rate": 9.909720604439652e-06, + "loss": 1.2768, + "step": 1058 + }, + { + "epoch": 0.5382123117972175, + "grad_norm": 2.974616765975952, + "learning_rate": 9.909402281903808e-06, + "loss": 1.3633, + "step": 1059 + }, + { + "epoch": 0.5387205387205387, + "grad_norm": 3.0021567344665527, + "learning_rate": 9.909083404287853e-06, + "loss": 1.3469, + "step": 1060 + }, + { + "epoch": 0.53922876564386, + "grad_norm": 2.866323709487915, + "learning_rate": 9.908763971627846e-06, + "loss": 1.2739, + "step": 1061 + }, + { + "epoch": 0.5397369925671812, + "grad_norm": 3.079787254333496, + "learning_rate": 9.908443983959903e-06, + "loss": 1.2476, + "step": 1062 + }, + { + "epoch": 0.5402452194905025, + "grad_norm": 2.970996141433716, + "learning_rate": 9.9081234413202e-06, + "loss": 1.2822, + "step": 1063 + }, + { + "epoch": 0.5407534464138237, + "grad_norm": 3.0350842475891113, + "learning_rate": 9.907802343744983e-06, + "loss": 1.2566, + "step": 1064 + }, + { + "epoch": 0.541261673337145, + "grad_norm": 2.901156425476074, + "learning_rate": 9.907480691270554e-06, + "loss": 1.2111, + "step": 1065 + }, + { + "epoch": 0.5417699002604663, + "grad_norm": 3.4042131900787354, + "learning_rate": 9.907158483933283e-06, + "loss": 1.388, + "step": 1066 + }, + { + "epoch": 0.5422781271837875, + "grad_norm": 2.9463226795196533, + "learning_rate": 9.906835721769597e-06, + "loss": 1.1387, + "step": 1067 + }, + { + "epoch": 0.5427863541071088, + "grad_norm": 2.950364589691162, + "learning_rate": 9.90651240481599e-06, + "loss": 1.2743, + "step": 1068 + }, + { + "epoch": 0.54329458103043, + "grad_norm": 3.0166707038879395, + "learning_rate": 9.906188533109022e-06, + "loss": 1.2999, + "step": 1069 + }, + { + "epoch": 0.5438028079537514, + "grad_norm": 2.9995715618133545, + "learning_rate": 9.905864106685305e-06, + "loss": 1.3692, + "step": 1070 + }, + { + "epoch": 0.5443110348770727, + "grad_norm": 2.8355770111083984, + "learning_rate": 9.905539125581525e-06, + "loss": 1.222, + "step": 1071 + }, + { + "epoch": 0.5448192618003939, + "grad_norm": 3.0823659896850586, + "learning_rate": 9.905213589834424e-06, + "loss": 1.2928, + "step": 1072 + }, + { + "epoch": 0.5453274887237152, + "grad_norm": 3.1366348266601562, + "learning_rate": 9.90488749948081e-06, + "loss": 1.2437, + "step": 1073 + }, + { + "epoch": 0.5458357156470364, + "grad_norm": 3.1095762252807617, + "learning_rate": 9.904560854557548e-06, + "loss": 1.2076, + "step": 1074 + }, + { + "epoch": 0.5463439425703577, + "grad_norm": 2.9151086807250977, + "learning_rate": 9.904233655101574e-06, + "loss": 1.2691, + "step": 1075 + }, + { + "epoch": 0.5468521694936789, + "grad_norm": 2.994748830795288, + "learning_rate": 9.903905901149881e-06, + "loss": 1.2917, + "step": 1076 + }, + { + "epoch": 0.5473603964170002, + "grad_norm": 3.118807315826416, + "learning_rate": 9.903577592739528e-06, + "loss": 1.2359, + "step": 1077 + }, + { + "epoch": 0.5478686233403215, + "grad_norm": 3.042778253555298, + "learning_rate": 9.903248729907635e-06, + "loss": 1.283, + "step": 1078 + }, + { + "epoch": 0.5483768502636427, + "grad_norm": 2.8278987407684326, + "learning_rate": 9.902919312691384e-06, + "loss": 1.2585, + "step": 1079 + }, + { + "epoch": 0.548885077186964, + "grad_norm": 2.88580322265625, + "learning_rate": 9.902589341128019e-06, + "loss": 1.2512, + "step": 1080 + }, + { + "epoch": 0.5493933041102852, + "grad_norm": 3.03999400138855, + "learning_rate": 9.902258815254851e-06, + "loss": 1.2731, + "step": 1081 + }, + { + "epoch": 0.5499015310336065, + "grad_norm": 3.7839131355285645, + "learning_rate": 9.901927735109249e-06, + "loss": 1.3055, + "step": 1082 + }, + { + "epoch": 0.5504097579569278, + "grad_norm": 3.0038340091705322, + "learning_rate": 9.901596100728646e-06, + "loss": 1.2088, + "step": 1083 + }, + { + "epoch": 0.550917984880249, + "grad_norm": 3.1675291061401367, + "learning_rate": 9.90126391215054e-06, + "loss": 1.2438, + "step": 1084 + }, + { + "epoch": 0.5514262118035703, + "grad_norm": 3.0010335445404053, + "learning_rate": 9.900931169412488e-06, + "loss": 1.2682, + "step": 1085 + }, + { + "epoch": 0.5519344387268915, + "grad_norm": 2.973571300506592, + "learning_rate": 9.900597872552113e-06, + "loss": 1.283, + "step": 1086 + }, + { + "epoch": 0.5524426656502128, + "grad_norm": 3.2726941108703613, + "learning_rate": 9.9002640216071e-06, + "loss": 1.2838, + "step": 1087 + }, + { + "epoch": 0.5529508925735341, + "grad_norm": 3.167182207107544, + "learning_rate": 9.899929616615192e-06, + "loss": 1.2879, + "step": 1088 + }, + { + "epoch": 0.5534591194968553, + "grad_norm": 3.0281550884246826, + "learning_rate": 9.899594657614201e-06, + "loss": 1.1682, + "step": 1089 + }, + { + "epoch": 0.5539673464201766, + "grad_norm": 3.0986578464508057, + "learning_rate": 9.899259144641999e-06, + "loss": 1.3208, + "step": 1090 + }, + { + "epoch": 0.5544755733434978, + "grad_norm": 3.445312023162842, + "learning_rate": 9.89892307773652e-06, + "loss": 1.224, + "step": 1091 + }, + { + "epoch": 0.5549838002668191, + "grad_norm": 3.1991617679595947, + "learning_rate": 9.898586456935761e-06, + "loss": 1.3483, + "step": 1092 + }, + { + "epoch": 0.5554920271901403, + "grad_norm": 3.3592443466186523, + "learning_rate": 9.898249282277784e-06, + "loss": 1.3855, + "step": 1093 + }, + { + "epoch": 0.5560002541134617, + "grad_norm": 3.050511121749878, + "learning_rate": 9.897911553800709e-06, + "loss": 1.3756, + "step": 1094 + }, + { + "epoch": 0.556508481036783, + "grad_norm": 3.1178085803985596, + "learning_rate": 9.897573271542721e-06, + "loss": 1.3593, + "step": 1095 + }, + { + "epoch": 0.5570167079601042, + "grad_norm": 3.3286967277526855, + "learning_rate": 9.897234435542072e-06, + "loss": 1.2354, + "step": 1096 + }, + { + "epoch": 0.5575249348834255, + "grad_norm": 3.2614622116088867, + "learning_rate": 9.896895045837067e-06, + "loss": 1.3017, + "step": 1097 + }, + { + "epoch": 0.5580331618067467, + "grad_norm": 3.1033172607421875, + "learning_rate": 9.896555102466083e-06, + "loss": 1.3554, + "step": 1098 + }, + { + "epoch": 0.558541388730068, + "grad_norm": 3.0228354930877686, + "learning_rate": 9.896214605467553e-06, + "loss": 1.2444, + "step": 1099 + }, + { + "epoch": 0.5590496156533893, + "grad_norm": 2.8342230319976807, + "learning_rate": 9.895873554879978e-06, + "loss": 1.2475, + "step": 1100 + }, + { + "epoch": 0.5595578425767105, + "grad_norm": 3.0209481716156006, + "learning_rate": 9.895531950741915e-06, + "loss": 1.2892, + "step": 1101 + }, + { + "epoch": 0.5600660695000318, + "grad_norm": 2.9123499393463135, + "learning_rate": 9.89518979309199e-06, + "loss": 1.26, + "step": 1102 + }, + { + "epoch": 0.560574296423353, + "grad_norm": 2.979750394821167, + "learning_rate": 9.894847081968888e-06, + "loss": 1.2042, + "step": 1103 + }, + { + "epoch": 0.5610825233466743, + "grad_norm": 3.2477877140045166, + "learning_rate": 9.894503817411358e-06, + "loss": 1.553, + "step": 1104 + }, + { + "epoch": 0.5615907502699955, + "grad_norm": 3.2751965522766113, + "learning_rate": 9.89415999945821e-06, + "loss": 1.2902, + "step": 1105 + }, + { + "epoch": 0.5620989771933168, + "grad_norm": 3.260960578918457, + "learning_rate": 9.89381562814832e-06, + "loss": 1.2309, + "step": 1106 + }, + { + "epoch": 0.5626072041166381, + "grad_norm": 2.87548565864563, + "learning_rate": 9.893470703520622e-06, + "loss": 1.2196, + "step": 1107 + }, + { + "epoch": 0.5631154310399593, + "grad_norm": 3.0245654582977295, + "learning_rate": 9.893125225614117e-06, + "loss": 1.2439, + "step": 1108 + }, + { + "epoch": 0.5636236579632806, + "grad_norm": 2.7714860439300537, + "learning_rate": 9.892779194467864e-06, + "loss": 1.3271, + "step": 1109 + }, + { + "epoch": 0.5641318848866018, + "grad_norm": 2.8270699977874756, + "learning_rate": 9.892432610120987e-06, + "loss": 1.1949, + "step": 1110 + }, + { + "epoch": 0.5646401118099231, + "grad_norm": 3.2219133377075195, + "learning_rate": 9.892085472612675e-06, + "loss": 1.241, + "step": 1111 + }, + { + "epoch": 0.5651483387332444, + "grad_norm": 3.015878677368164, + "learning_rate": 9.891737781982174e-06, + "loss": 1.3107, + "step": 1112 + }, + { + "epoch": 0.5656565656565656, + "grad_norm": 3.113751173019409, + "learning_rate": 9.891389538268799e-06, + "loss": 1.3017, + "step": 1113 + }, + { + "epoch": 0.5661647925798869, + "grad_norm": 3.0058841705322266, + "learning_rate": 9.89104074151192e-06, + "loss": 1.2783, + "step": 1114 + }, + { + "epoch": 0.5666730195032081, + "grad_norm": 2.8917829990386963, + "learning_rate": 9.890691391750977e-06, + "loss": 1.2405, + "step": 1115 + }, + { + "epoch": 0.5671812464265295, + "grad_norm": 3.019864082336426, + "learning_rate": 9.890341489025466e-06, + "loss": 1.1901, + "step": 1116 + }, + { + "epoch": 0.5676894733498508, + "grad_norm": 2.9965898990631104, + "learning_rate": 9.889991033374952e-06, + "loss": 1.3086, + "step": 1117 + }, + { + "epoch": 0.568197700273172, + "grad_norm": 2.688847780227661, + "learning_rate": 9.889640024839057e-06, + "loss": 1.2379, + "step": 1118 + }, + { + "epoch": 0.5687059271964933, + "grad_norm": 3.068826198577881, + "learning_rate": 9.889288463457468e-06, + "loss": 1.2525, + "step": 1119 + }, + { + "epoch": 0.5692141541198145, + "grad_norm": 3.1524131298065186, + "learning_rate": 9.888936349269934e-06, + "loss": 1.2592, + "step": 1120 + }, + { + "epoch": 0.5697223810431358, + "grad_norm": 2.97160267829895, + "learning_rate": 9.888583682316268e-06, + "loss": 1.2293, + "step": 1121 + }, + { + "epoch": 0.570230607966457, + "grad_norm": 3.040951728820801, + "learning_rate": 9.888230462636343e-06, + "loss": 1.2587, + "step": 1122 + }, + { + "epoch": 0.5707388348897783, + "grad_norm": 3.0704641342163086, + "learning_rate": 9.887876690270095e-06, + "loss": 1.3122, + "step": 1123 + }, + { + "epoch": 0.5712470618130996, + "grad_norm": 3.068542242050171, + "learning_rate": 9.887522365257525e-06, + "loss": 1.3523, + "step": 1124 + }, + { + "epoch": 0.5717552887364208, + "grad_norm": 3.050361394882202, + "learning_rate": 9.887167487638693e-06, + "loss": 1.2626, + "step": 1125 + }, + { + "epoch": 0.5722635156597421, + "grad_norm": 3.1941027641296387, + "learning_rate": 9.886812057453726e-06, + "loss": 1.389, + "step": 1126 + }, + { + "epoch": 0.5727717425830633, + "grad_norm": 3.0776960849761963, + "learning_rate": 9.886456074742806e-06, + "loss": 1.2869, + "step": 1127 + }, + { + "epoch": 0.5732799695063846, + "grad_norm": 3.1108217239379883, + "learning_rate": 9.886099539546185e-06, + "loss": 1.2325, + "step": 1128 + }, + { + "epoch": 0.5737881964297059, + "grad_norm": 2.865870714187622, + "learning_rate": 9.885742451904174e-06, + "loss": 1.2044, + "step": 1129 + }, + { + "epoch": 0.5742964233530271, + "grad_norm": 2.8582499027252197, + "learning_rate": 9.885384811857148e-06, + "loss": 1.1932, + "step": 1130 + }, + { + "epoch": 0.5748046502763484, + "grad_norm": 3.5153896808624268, + "learning_rate": 9.885026619445544e-06, + "loss": 1.3823, + "step": 1131 + }, + { + "epoch": 0.5753128771996696, + "grad_norm": 2.8332269191741943, + "learning_rate": 9.884667874709857e-06, + "loss": 1.2556, + "step": 1132 + }, + { + "epoch": 0.5758211041229909, + "grad_norm": 2.7498703002929688, + "learning_rate": 9.88430857769065e-06, + "loss": 1.1875, + "step": 1133 + }, + { + "epoch": 0.5763293310463122, + "grad_norm": 2.9405388832092285, + "learning_rate": 9.883948728428551e-06, + "loss": 1.1411, + "step": 1134 + }, + { + "epoch": 0.5768375579696334, + "grad_norm": 2.9063611030578613, + "learning_rate": 9.883588326964242e-06, + "loss": 1.2758, + "step": 1135 + }, + { + "epoch": 0.5773457848929547, + "grad_norm": 3.066329002380371, + "learning_rate": 9.883227373338472e-06, + "loss": 1.2635, + "step": 1136 + }, + { + "epoch": 0.5778540118162759, + "grad_norm": 3.026329755783081, + "learning_rate": 9.882865867592054e-06, + "loss": 1.327, + "step": 1137 + }, + { + "epoch": 0.5783622387395972, + "grad_norm": 2.8590166568756104, + "learning_rate": 9.882503809765858e-06, + "loss": 1.2706, + "step": 1138 + }, + { + "epoch": 0.5788704656629184, + "grad_norm": 3.33844256401062, + "learning_rate": 9.882141199900823e-06, + "loss": 1.2434, + "step": 1139 + }, + { + "epoch": 0.5793786925862398, + "grad_norm": 2.910153865814209, + "learning_rate": 9.881778038037946e-06, + "loss": 1.2609, + "step": 1140 + }, + { + "epoch": 0.5798869195095611, + "grad_norm": 3.2438127994537354, + "learning_rate": 9.88141432421829e-06, + "loss": 1.308, + "step": 1141 + }, + { + "epoch": 0.5803951464328823, + "grad_norm": 3.1046183109283447, + "learning_rate": 9.881050058482976e-06, + "loss": 1.3514, + "step": 1142 + }, + { + "epoch": 0.5809033733562036, + "grad_norm": 2.9112555980682373, + "learning_rate": 9.88068524087319e-06, + "loss": 1.3074, + "step": 1143 + }, + { + "epoch": 0.5814116002795248, + "grad_norm": 3.073887586593628, + "learning_rate": 9.880319871430179e-06, + "loss": 1.219, + "step": 1144 + }, + { + "epoch": 0.5819198272028461, + "grad_norm": 2.8623321056365967, + "learning_rate": 9.879953950195255e-06, + "loss": 1.1971, + "step": 1145 + }, + { + "epoch": 0.5824280541261674, + "grad_norm": 2.9542438983917236, + "learning_rate": 9.879587477209793e-06, + "loss": 1.2554, + "step": 1146 + }, + { + "epoch": 0.5829362810494886, + "grad_norm": 3.502727508544922, + "learning_rate": 9.879220452515224e-06, + "loss": 1.254, + "step": 1147 + }, + { + "epoch": 0.5834445079728099, + "grad_norm": 2.9458866119384766, + "learning_rate": 9.878852876153047e-06, + "loss": 1.2976, + "step": 1148 + }, + { + "epoch": 0.5839527348961311, + "grad_norm": 3.059884786605835, + "learning_rate": 9.87848474816482e-06, + "loss": 1.3303, + "step": 1149 + }, + { + "epoch": 0.5844609618194524, + "grad_norm": 2.8677780628204346, + "learning_rate": 9.878116068592169e-06, + "loss": 1.2808, + "step": 1150 + }, + { + "epoch": 0.5849691887427736, + "grad_norm": 3.375119209289551, + "learning_rate": 9.877746837476777e-06, + "loss": 1.2412, + "step": 1151 + }, + { + "epoch": 0.5854774156660949, + "grad_norm": 3.057594060897827, + "learning_rate": 9.877377054860391e-06, + "loss": 1.2625, + "step": 1152 + }, + { + "epoch": 0.5859856425894162, + "grad_norm": 3.1959619522094727, + "learning_rate": 9.87700672078482e-06, + "loss": 1.306, + "step": 1153 + }, + { + "epoch": 0.5864938695127374, + "grad_norm": 2.947911262512207, + "learning_rate": 9.876635835291936e-06, + "loss": 1.2275, + "step": 1154 + }, + { + "epoch": 0.5870020964360587, + "grad_norm": 4.026703834533691, + "learning_rate": 9.876264398423672e-06, + "loss": 1.3709, + "step": 1155 + }, + { + "epoch": 0.5875103233593799, + "grad_norm": 2.906632661819458, + "learning_rate": 9.875892410222027e-06, + "loss": 1.3088, + "step": 1156 + }, + { + "epoch": 0.5880185502827012, + "grad_norm": 2.9481449127197266, + "learning_rate": 9.875519870729057e-06, + "loss": 1.3556, + "step": 1157 + }, + { + "epoch": 0.5885267772060225, + "grad_norm": 2.9592795372009277, + "learning_rate": 9.875146779986885e-06, + "loss": 1.2336, + "step": 1158 + }, + { + "epoch": 0.5890350041293437, + "grad_norm": 2.996302604675293, + "learning_rate": 9.874773138037693e-06, + "loss": 1.2626, + "step": 1159 + }, + { + "epoch": 0.589543231052665, + "grad_norm": 2.965101480484009, + "learning_rate": 9.874398944923728e-06, + "loss": 1.3835, + "step": 1160 + }, + { + "epoch": 0.5900514579759862, + "grad_norm": 2.9105746746063232, + "learning_rate": 9.874024200687297e-06, + "loss": 1.1297, + "step": 1161 + }, + { + "epoch": 0.5905596848993075, + "grad_norm": 2.9277119636535645, + "learning_rate": 9.873648905370769e-06, + "loss": 1.1621, + "step": 1162 + }, + { + "epoch": 0.5910679118226289, + "grad_norm": 3.346733808517456, + "learning_rate": 9.873273059016582e-06, + "loss": 1.3174, + "step": 1163 + }, + { + "epoch": 0.59157613874595, + "grad_norm": 3.2384955883026123, + "learning_rate": 9.872896661667224e-06, + "loss": 1.2219, + "step": 1164 + }, + { + "epoch": 0.5920843656692714, + "grad_norm": 2.9235384464263916, + "learning_rate": 9.872519713365259e-06, + "loss": 1.264, + "step": 1165 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 3.221442222595215, + "learning_rate": 9.8721422141533e-06, + "loss": 1.1639, + "step": 1166 + }, + { + "epoch": 0.5931008195159139, + "grad_norm": 2.9388232231140137, + "learning_rate": 9.871764164074033e-06, + "loss": 1.216, + "step": 1167 + }, + { + "epoch": 0.5936090464392351, + "grad_norm": 3.0020532608032227, + "learning_rate": 9.871385563170201e-06, + "loss": 1.2731, + "step": 1168 + }, + { + "epoch": 0.5941172733625564, + "grad_norm": 3.0851593017578125, + "learning_rate": 9.87100641148461e-06, + "loss": 1.1301, + "step": 1169 + }, + { + "epoch": 0.5946255002858777, + "grad_norm": 2.9967799186706543, + "learning_rate": 9.870626709060131e-06, + "loss": 1.22, + "step": 1170 + }, + { + "epoch": 0.5951337272091989, + "grad_norm": 3.1237094402313232, + "learning_rate": 9.870246455939692e-06, + "loss": 1.2942, + "step": 1171 + }, + { + "epoch": 0.5956419541325202, + "grad_norm": 3.2442684173583984, + "learning_rate": 9.869865652166287e-06, + "loss": 1.2948, + "step": 1172 + }, + { + "epoch": 0.5961501810558414, + "grad_norm": 3.2860963344573975, + "learning_rate": 9.869484297782971e-06, + "loss": 1.3071, + "step": 1173 + }, + { + "epoch": 0.5966584079791627, + "grad_norm": 2.9791018962860107, + "learning_rate": 9.869102392832863e-06, + "loss": 1.2806, + "step": 1174 + }, + { + "epoch": 0.597166634902484, + "grad_norm": 2.7118618488311768, + "learning_rate": 9.868719937359144e-06, + "loss": 1.2168, + "step": 1175 + }, + { + "epoch": 0.5976748618258052, + "grad_norm": 2.7597343921661377, + "learning_rate": 9.868336931405054e-06, + "loss": 1.2258, + "step": 1176 + }, + { + "epoch": 0.5981830887491265, + "grad_norm": 3.0382118225097656, + "learning_rate": 9.867953375013897e-06, + "loss": 1.3343, + "step": 1177 + }, + { + "epoch": 0.5986913156724477, + "grad_norm": 3.269522190093994, + "learning_rate": 9.86756926822904e-06, + "loss": 1.2483, + "step": 1178 + }, + { + "epoch": 0.599199542595769, + "grad_norm": 2.7839956283569336, + "learning_rate": 9.867184611093914e-06, + "loss": 1.2309, + "step": 1179 + }, + { + "epoch": 0.5997077695190903, + "grad_norm": 2.8881192207336426, + "learning_rate": 9.86679940365201e-06, + "loss": 1.2939, + "step": 1180 + }, + { + "epoch": 0.6002159964424115, + "grad_norm": 2.9655847549438477, + "learning_rate": 9.86641364594688e-06, + "loss": 1.2051, + "step": 1181 + }, + { + "epoch": 0.6007242233657328, + "grad_norm": 3.159656047821045, + "learning_rate": 9.866027338022139e-06, + "loss": 1.3687, + "step": 1182 + }, + { + "epoch": 0.601232450289054, + "grad_norm": 3.0268661975860596, + "learning_rate": 9.865640479921465e-06, + "loss": 1.218, + "step": 1183 + }, + { + "epoch": 0.6017406772123753, + "grad_norm": 3.583407402038574, + "learning_rate": 9.865253071688598e-06, + "loss": 1.2427, + "step": 1184 + }, + { + "epoch": 0.6022489041356965, + "grad_norm": 3.025599718093872, + "learning_rate": 9.864865113367344e-06, + "loss": 1.2514, + "step": 1185 + }, + { + "epoch": 0.6027571310590178, + "grad_norm": 2.75777006149292, + "learning_rate": 9.864476605001561e-06, + "loss": 1.2296, + "step": 1186 + }, + { + "epoch": 0.6032653579823392, + "grad_norm": 2.9044742584228516, + "learning_rate": 9.864087546635181e-06, + "loss": 1.2544, + "step": 1187 + }, + { + "epoch": 0.6037735849056604, + "grad_norm": 3.1498332023620605, + "learning_rate": 9.86369793831219e-06, + "loss": 1.3202, + "step": 1188 + }, + { + "epoch": 0.6042818118289817, + "grad_norm": 3.185675859451294, + "learning_rate": 9.863307780076638e-06, + "loss": 1.2586, + "step": 1189 + }, + { + "epoch": 0.6047900387523029, + "grad_norm": 3.4412953853607178, + "learning_rate": 9.86291707197264e-06, + "loss": 1.3381, + "step": 1190 + }, + { + "epoch": 0.6052982656756242, + "grad_norm": 3.0474026203155518, + "learning_rate": 9.862525814044373e-06, + "loss": 1.2852, + "step": 1191 + }, + { + "epoch": 0.6058064925989455, + "grad_norm": 2.7538821697235107, + "learning_rate": 9.86213400633607e-06, + "loss": 1.2725, + "step": 1192 + }, + { + "epoch": 0.6063147195222667, + "grad_norm": 3.0935001373291016, + "learning_rate": 9.861741648892035e-06, + "loss": 1.2087, + "step": 1193 + }, + { + "epoch": 0.606822946445588, + "grad_norm": 2.796851396560669, + "learning_rate": 9.861348741756626e-06, + "loss": 1.2487, + "step": 1194 + }, + { + "epoch": 0.6073311733689092, + "grad_norm": 3.0847465991973877, + "learning_rate": 9.86095528497427e-06, + "loss": 1.2479, + "step": 1195 + }, + { + "epoch": 0.6078394002922305, + "grad_norm": 2.979198932647705, + "learning_rate": 9.860561278589452e-06, + "loss": 1.2393, + "step": 1196 + }, + { + "epoch": 0.6083476272155517, + "grad_norm": 3.056978464126587, + "learning_rate": 9.860166722646718e-06, + "loss": 1.1733, + "step": 1197 + }, + { + "epoch": 0.608855854138873, + "grad_norm": 2.78646183013916, + "learning_rate": 9.859771617190681e-06, + "loss": 1.2877, + "step": 1198 + }, + { + "epoch": 0.6093640810621943, + "grad_norm": 2.911860704421997, + "learning_rate": 9.859375962266014e-06, + "loss": 1.2914, + "step": 1199 + }, + { + "epoch": 0.6098723079855155, + "grad_norm": 2.7991490364074707, + "learning_rate": 9.85897975791745e-06, + "loss": 1.2194, + "step": 1200 + }, + { + "epoch": 0.6103805349088368, + "grad_norm": 2.8022921085357666, + "learning_rate": 9.858583004189785e-06, + "loss": 1.2472, + "step": 1201 + }, + { + "epoch": 0.610888761832158, + "grad_norm": 3.0368905067443848, + "learning_rate": 9.85818570112788e-06, + "loss": 1.3095, + "step": 1202 + }, + { + "epoch": 0.6113969887554793, + "grad_norm": 2.757432460784912, + "learning_rate": 9.857787848776656e-06, + "loss": 1.1634, + "step": 1203 + }, + { + "epoch": 0.6119052156788006, + "grad_norm": 3.2205071449279785, + "learning_rate": 9.857389447181093e-06, + "loss": 1.2799, + "step": 1204 + }, + { + "epoch": 0.6124134426021218, + "grad_norm": 3.149803876876831, + "learning_rate": 9.85699049638624e-06, + "loss": 1.312, + "step": 1205 + }, + { + "epoch": 0.6129216695254431, + "grad_norm": 2.9970386028289795, + "learning_rate": 9.8565909964372e-06, + "loss": 1.2576, + "step": 1206 + }, + { + "epoch": 0.6134298964487643, + "grad_norm": 3.1370797157287598, + "learning_rate": 9.856190947379148e-06, + "loss": 1.3491, + "step": 1207 + }, + { + "epoch": 0.6139381233720856, + "grad_norm": 3.0502049922943115, + "learning_rate": 9.855790349257311e-06, + "loss": 1.1822, + "step": 1208 + }, + { + "epoch": 0.614446350295407, + "grad_norm": 3.278427839279175, + "learning_rate": 9.855389202116983e-06, + "loss": 1.2727, + "step": 1209 + }, + { + "epoch": 0.6149545772187281, + "grad_norm": 3.1668384075164795, + "learning_rate": 9.85498750600352e-06, + "loss": 1.3367, + "step": 1210 + }, + { + "epoch": 0.6154628041420495, + "grad_norm": 2.8745815753936768, + "learning_rate": 9.85458526096234e-06, + "loss": 1.2038, + "step": 1211 + }, + { + "epoch": 0.6159710310653707, + "grad_norm": 2.781729221343994, + "learning_rate": 9.854182467038922e-06, + "loss": 1.224, + "step": 1212 + }, + { + "epoch": 0.616479257988692, + "grad_norm": 2.9090940952301025, + "learning_rate": 9.85377912427881e-06, + "loss": 1.2572, + "step": 1213 + }, + { + "epoch": 0.6169874849120132, + "grad_norm": 2.9433419704437256, + "learning_rate": 9.853375232727606e-06, + "loss": 1.1687, + "step": 1214 + }, + { + "epoch": 0.6174957118353345, + "grad_norm": 3.9726810455322266, + "learning_rate": 9.852970792430976e-06, + "loss": 1.1999, + "step": 1215 + }, + { + "epoch": 0.6180039387586558, + "grad_norm": 3.0864198207855225, + "learning_rate": 9.852565803434649e-06, + "loss": 1.2704, + "step": 1216 + }, + { + "epoch": 0.618512165681977, + "grad_norm": 2.8298897743225098, + "learning_rate": 9.852160265784411e-06, + "loss": 1.2681, + "step": 1217 + }, + { + "epoch": 0.6190203926052983, + "grad_norm": 2.9570887088775635, + "learning_rate": 9.851754179526118e-06, + "loss": 1.1922, + "step": 1218 + }, + { + "epoch": 0.6195286195286195, + "grad_norm": 2.864625930786133, + "learning_rate": 9.851347544705686e-06, + "loss": 1.2429, + "step": 1219 + }, + { + "epoch": 0.6200368464519408, + "grad_norm": 2.9287493228912354, + "learning_rate": 9.850940361369085e-06, + "loss": 1.1807, + "step": 1220 + }, + { + "epoch": 0.6205450733752621, + "grad_norm": 3.0884289741516113, + "learning_rate": 9.850532629562357e-06, + "loss": 1.3063, + "step": 1221 + }, + { + "epoch": 0.6210533002985833, + "grad_norm": 2.916370153427124, + "learning_rate": 9.850124349331602e-06, + "loss": 1.3281, + "step": 1222 + }, + { + "epoch": 0.6215615272219046, + "grad_norm": 2.9838948249816895, + "learning_rate": 9.84971552072298e-06, + "loss": 1.2799, + "step": 1223 + }, + { + "epoch": 0.6220697541452258, + "grad_norm": 2.813861846923828, + "learning_rate": 9.849306143782717e-06, + "loss": 1.2931, + "step": 1224 + }, + { + "epoch": 0.6225779810685471, + "grad_norm": 2.860564708709717, + "learning_rate": 9.848896218557098e-06, + "loss": 1.2828, + "step": 1225 + }, + { + "epoch": 0.6230862079918684, + "grad_norm": 2.733185291290283, + "learning_rate": 9.848485745092472e-06, + "loss": 1.1781, + "step": 1226 + }, + { + "epoch": 0.6235944349151896, + "grad_norm": 4.069754600524902, + "learning_rate": 9.848074723435248e-06, + "loss": 1.2646, + "step": 1227 + }, + { + "epoch": 0.6241026618385109, + "grad_norm": 2.9285528659820557, + "learning_rate": 9.8476631536319e-06, + "loss": 1.3353, + "step": 1228 + }, + { + "epoch": 0.6246108887618321, + "grad_norm": 2.9530718326568604, + "learning_rate": 9.84725103572896e-06, + "loss": 1.2233, + "step": 1229 + }, + { + "epoch": 0.6251191156851534, + "grad_norm": 2.9010536670684814, + "learning_rate": 9.846838369773024e-06, + "loss": 1.304, + "step": 1230 + }, + { + "epoch": 0.6256273426084746, + "grad_norm": 2.8730621337890625, + "learning_rate": 9.84642515581075e-06, + "loss": 1.2007, + "step": 1231 + }, + { + "epoch": 0.6261355695317959, + "grad_norm": 3.3889389038085938, + "learning_rate": 9.84601139388886e-06, + "loss": 1.3055, + "step": 1232 + }, + { + "epoch": 0.6266437964551173, + "grad_norm": 2.939222812652588, + "learning_rate": 9.845597084054135e-06, + "loss": 1.1747, + "step": 1233 + }, + { + "epoch": 0.6271520233784385, + "grad_norm": 3.0841636657714844, + "learning_rate": 9.845182226353415e-06, + "loss": 1.3309, + "step": 1234 + }, + { + "epoch": 0.6276602503017598, + "grad_norm": 3.2949295043945312, + "learning_rate": 9.844766820833613e-06, + "loss": 1.3251, + "step": 1235 + }, + { + "epoch": 0.628168477225081, + "grad_norm": 2.994581699371338, + "learning_rate": 9.84435086754169e-06, + "loss": 1.4239, + "step": 1236 + }, + { + "epoch": 0.6286767041484023, + "grad_norm": 2.904791831970215, + "learning_rate": 9.843934366524679e-06, + "loss": 1.1277, + "step": 1237 + }, + { + "epoch": 0.6291849310717236, + "grad_norm": 2.857452630996704, + "learning_rate": 9.843517317829672e-06, + "loss": 1.2775, + "step": 1238 + }, + { + "epoch": 0.6296931579950448, + "grad_norm": 3.0897974967956543, + "learning_rate": 9.84309972150382e-06, + "loss": 1.4043, + "step": 1239 + }, + { + "epoch": 0.6302013849183661, + "grad_norm": 2.9603357315063477, + "learning_rate": 9.84268157759434e-06, + "loss": 1.2107, + "step": 1240 + }, + { + "epoch": 0.6307096118416873, + "grad_norm": 3.1953182220458984, + "learning_rate": 9.842262886148509e-06, + "loss": 1.292, + "step": 1241 + }, + { + "epoch": 0.6312178387650086, + "grad_norm": 3.0074422359466553, + "learning_rate": 9.841843647213664e-06, + "loss": 1.3658, + "step": 1242 + }, + { + "epoch": 0.6317260656883298, + "grad_norm": 3.2771244049072266, + "learning_rate": 9.84142386083721e-06, + "loss": 1.2754, + "step": 1243 + }, + { + "epoch": 0.6322342926116511, + "grad_norm": 2.9563822746276855, + "learning_rate": 9.84100352706661e-06, + "loss": 1.2131, + "step": 1244 + }, + { + "epoch": 0.6327425195349724, + "grad_norm": 2.826014995574951, + "learning_rate": 9.840582645949388e-06, + "loss": 1.1562, + "step": 1245 + }, + { + "epoch": 0.6332507464582936, + "grad_norm": 2.9703335762023926, + "learning_rate": 9.840161217533129e-06, + "loss": 1.4529, + "step": 1246 + }, + { + "epoch": 0.6337589733816149, + "grad_norm": 2.9779446125030518, + "learning_rate": 9.83973924186548e-06, + "loss": 1.2196, + "step": 1247 + }, + { + "epoch": 0.6342672003049361, + "grad_norm": 2.989461898803711, + "learning_rate": 9.839316718994159e-06, + "loss": 1.2317, + "step": 1248 + }, + { + "epoch": 0.6347754272282574, + "grad_norm": 3.122593402862549, + "learning_rate": 9.838893648966931e-06, + "loss": 1.2885, + "step": 1249 + }, + { + "epoch": 0.6352836541515787, + "grad_norm": 2.9813296794891357, + "learning_rate": 9.838470031831632e-06, + "loss": 1.2475, + "step": 1250 + }, + { + "epoch": 0.6357918810748999, + "grad_norm": 3.026923894882202, + "learning_rate": 9.838045867636163e-06, + "loss": 1.2436, + "step": 1251 + }, + { + "epoch": 0.6363001079982212, + "grad_norm": 2.8064677715301514, + "learning_rate": 9.837621156428476e-06, + "loss": 1.2575, + "step": 1252 + }, + { + "epoch": 0.6368083349215424, + "grad_norm": 3.0424234867095947, + "learning_rate": 9.837195898256593e-06, + "loss": 1.288, + "step": 1253 + }, + { + "epoch": 0.6373165618448637, + "grad_norm": 2.877368688583374, + "learning_rate": 9.836770093168595e-06, + "loss": 1.2892, + "step": 1254 + }, + { + "epoch": 0.637824788768185, + "grad_norm": 3.133418560028076, + "learning_rate": 9.836343741212628e-06, + "loss": 1.3596, + "step": 1255 + }, + { + "epoch": 0.6383330156915062, + "grad_norm": 9.114967346191406, + "learning_rate": 9.835916842436895e-06, + "loss": 1.3345, + "step": 1256 + }, + { + "epoch": 0.6388412426148276, + "grad_norm": 3.0029051303863525, + "learning_rate": 9.835489396889663e-06, + "loss": 1.2896, + "step": 1257 + }, + { + "epoch": 0.6393494695381488, + "grad_norm": 3.1740221977233887, + "learning_rate": 9.835061404619263e-06, + "loss": 1.2226, + "step": 1258 + }, + { + "epoch": 0.6398576964614701, + "grad_norm": 3.1588032245635986, + "learning_rate": 9.834632865674084e-06, + "loss": 1.2797, + "step": 1259 + }, + { + "epoch": 0.6403659233847913, + "grad_norm": 2.870164394378662, + "learning_rate": 9.834203780102579e-06, + "loss": 1.3561, + "step": 1260 + }, + { + "epoch": 0.6408741503081126, + "grad_norm": 3.0878357887268066, + "learning_rate": 9.833774147953264e-06, + "loss": 1.3606, + "step": 1261 + }, + { + "epoch": 0.6413823772314339, + "grad_norm": 2.916350841522217, + "learning_rate": 9.833343969274712e-06, + "loss": 1.2902, + "step": 1262 + }, + { + "epoch": 0.6418906041547551, + "grad_norm": 3.019193172454834, + "learning_rate": 9.832913244115565e-06, + "loss": 1.3008, + "step": 1263 + }, + { + "epoch": 0.6423988310780764, + "grad_norm": 3.3435311317443848, + "learning_rate": 9.83248197252452e-06, + "loss": 1.2686, + "step": 1264 + }, + { + "epoch": 0.6429070580013976, + "grad_norm": 2.869995594024658, + "learning_rate": 9.832050154550338e-06, + "loss": 1.1683, + "step": 1265 + }, + { + "epoch": 0.6434152849247189, + "grad_norm": 2.8468031883239746, + "learning_rate": 9.831617790241845e-06, + "loss": 1.2572, + "step": 1266 + }, + { + "epoch": 0.6439235118480402, + "grad_norm": 2.917226552963257, + "learning_rate": 9.831184879647927e-06, + "loss": 1.3825, + "step": 1267 + }, + { + "epoch": 0.6444317387713614, + "grad_norm": 3.3933417797088623, + "learning_rate": 9.830751422817526e-06, + "loss": 1.3198, + "step": 1268 + }, + { + "epoch": 0.6449399656946827, + "grad_norm": 2.893857717514038, + "learning_rate": 9.830317419799654e-06, + "loss": 1.2115, + "step": 1269 + }, + { + "epoch": 0.6454481926180039, + "grad_norm": 3.2240967750549316, + "learning_rate": 9.82988287064338e-06, + "loss": 1.3072, + "step": 1270 + }, + { + "epoch": 0.6459564195413252, + "grad_norm": 2.896242141723633, + "learning_rate": 9.829447775397837e-06, + "loss": 1.3173, + "step": 1271 + }, + { + "epoch": 0.6464646464646465, + "grad_norm": 3.0197970867156982, + "learning_rate": 9.829012134112222e-06, + "loss": 1.2142, + "step": 1272 + }, + { + "epoch": 0.6469728733879677, + "grad_norm": 2.990753650665283, + "learning_rate": 9.828575946835786e-06, + "loss": 1.3508, + "step": 1273 + }, + { + "epoch": 0.647481100311289, + "grad_norm": 3.1516451835632324, + "learning_rate": 9.828139213617847e-06, + "loss": 1.2211, + "step": 1274 + }, + { + "epoch": 0.6479893272346102, + "grad_norm": 2.989999771118164, + "learning_rate": 9.827701934507785e-06, + "loss": 1.3364, + "step": 1275 + }, + { + "epoch": 0.6484975541579315, + "grad_norm": 2.891176700592041, + "learning_rate": 9.827264109555041e-06, + "loss": 1.2299, + "step": 1276 + }, + { + "epoch": 0.6490057810812527, + "grad_norm": 3.024106025695801, + "learning_rate": 9.826825738809119e-06, + "loss": 1.2658, + "step": 1277 + }, + { + "epoch": 0.649514008004574, + "grad_norm": 3.742095470428467, + "learning_rate": 9.826386822319582e-06, + "loss": 1.2443, + "step": 1278 + }, + { + "epoch": 0.6500222349278953, + "grad_norm": 3.057175397872925, + "learning_rate": 9.825947360136055e-06, + "loss": 1.2077, + "step": 1279 + }, + { + "epoch": 0.6505304618512165, + "grad_norm": 3.2410778999328613, + "learning_rate": 9.825507352308225e-06, + "loss": 1.2809, + "step": 1280 + }, + { + "epoch": 0.6510386887745379, + "grad_norm": 2.82974910736084, + "learning_rate": 9.825066798885843e-06, + "loss": 1.2053, + "step": 1281 + }, + { + "epoch": 0.651546915697859, + "grad_norm": 3.046499013900757, + "learning_rate": 9.824625699918723e-06, + "loss": 1.2027, + "step": 1282 + }, + { + "epoch": 0.6520551426211804, + "grad_norm": 3.305159330368042, + "learning_rate": 9.824184055456729e-06, + "loss": 1.3742, + "step": 1283 + }, + { + "epoch": 0.6525633695445017, + "grad_norm": 3.1315276622772217, + "learning_rate": 9.823741865549805e-06, + "loss": 1.2914, + "step": 1284 + }, + { + "epoch": 0.6530715964678229, + "grad_norm": 3.0194857120513916, + "learning_rate": 9.823299130247941e-06, + "loss": 1.2446, + "step": 1285 + }, + { + "epoch": 0.6535798233911442, + "grad_norm": 2.8847827911376953, + "learning_rate": 9.822855849601198e-06, + "loss": 1.3122, + "step": 1286 + }, + { + "epoch": 0.6540880503144654, + "grad_norm": 3.0671706199645996, + "learning_rate": 9.822412023659692e-06, + "loss": 1.2765, + "step": 1287 + }, + { + "epoch": 0.6545962772377867, + "grad_norm": 2.971421480178833, + "learning_rate": 9.82196765247361e-06, + "loss": 1.2641, + "step": 1288 + }, + { + "epoch": 0.6551045041611079, + "grad_norm": 2.988215923309326, + "learning_rate": 9.821522736093189e-06, + "loss": 1.3037, + "step": 1289 + }, + { + "epoch": 0.6556127310844292, + "grad_norm": 2.7589046955108643, + "learning_rate": 9.821077274568734e-06, + "loss": 1.056, + "step": 1290 + }, + { + "epoch": 0.6561209580077505, + "grad_norm": 2.976534366607666, + "learning_rate": 9.820631267950613e-06, + "loss": 1.1519, + "step": 1291 + }, + { + "epoch": 0.6566291849310717, + "grad_norm": 2.928953170776367, + "learning_rate": 9.820184716289252e-06, + "loss": 1.3055, + "step": 1292 + }, + { + "epoch": 0.657137411854393, + "grad_norm": 3.0303738117218018, + "learning_rate": 9.819737619635143e-06, + "loss": 1.2309, + "step": 1293 + }, + { + "epoch": 0.6576456387777142, + "grad_norm": 3.0870563983917236, + "learning_rate": 9.819289978038833e-06, + "loss": 1.3138, + "step": 1294 + }, + { + "epoch": 0.6581538657010355, + "grad_norm": 2.9288690090179443, + "learning_rate": 9.818841791550938e-06, + "loss": 1.2676, + "step": 1295 + }, + { + "epoch": 0.6586620926243568, + "grad_norm": 2.846304178237915, + "learning_rate": 9.818393060222128e-06, + "loss": 1.2641, + "step": 1296 + }, + { + "epoch": 0.659170319547678, + "grad_norm": 2.9624176025390625, + "learning_rate": 9.817943784103142e-06, + "loss": 1.2804, + "step": 1297 + }, + { + "epoch": 0.6596785464709993, + "grad_norm": 2.7913033962249756, + "learning_rate": 9.817493963244778e-06, + "loss": 1.3064, + "step": 1298 + }, + { + "epoch": 0.6601867733943205, + "grad_norm": 2.988194465637207, + "learning_rate": 9.81704359769789e-06, + "loss": 1.3552, + "step": 1299 + }, + { + "epoch": 0.6606950003176418, + "grad_norm": 5.625545978546143, + "learning_rate": 9.816592687513404e-06, + "loss": 1.2971, + "step": 1300 + }, + { + "epoch": 0.6612032272409631, + "grad_norm": 3.0586233139038086, + "learning_rate": 9.8161412327423e-06, + "loss": 1.4045, + "step": 1301 + }, + { + "epoch": 0.6617114541642843, + "grad_norm": 3.3030478954315186, + "learning_rate": 9.815689233435619e-06, + "loss": 1.2915, + "step": 1302 + }, + { + "epoch": 0.6622196810876056, + "grad_norm": 3.2344744205474854, + "learning_rate": 9.81523668964447e-06, + "loss": 1.199, + "step": 1303 + }, + { + "epoch": 0.6627279080109268, + "grad_norm": 2.973972797393799, + "learning_rate": 9.814783601420018e-06, + "loss": 1.3101, + "step": 1304 + }, + { + "epoch": 0.6632361349342482, + "grad_norm": 3.051959276199341, + "learning_rate": 9.814329968813493e-06, + "loss": 1.3287, + "step": 1305 + }, + { + "epoch": 0.6637443618575694, + "grad_norm": 3.0178143978118896, + "learning_rate": 9.81387579187618e-06, + "loss": 1.1582, + "step": 1306 + }, + { + "epoch": 0.6642525887808907, + "grad_norm": 2.748084306716919, + "learning_rate": 9.813421070659435e-06, + "loss": 1.1526, + "step": 1307 + }, + { + "epoch": 0.664760815704212, + "grad_norm": 3.0890631675720215, + "learning_rate": 9.81296580521467e-06, + "loss": 1.1412, + "step": 1308 + }, + { + "epoch": 0.6652690426275332, + "grad_norm": 3.0133931636810303, + "learning_rate": 9.812509995593357e-06, + "loss": 1.3093, + "step": 1309 + }, + { + "epoch": 0.6657772695508545, + "grad_norm": 2.998985528945923, + "learning_rate": 9.812053641847038e-06, + "loss": 1.2876, + "step": 1310 + }, + { + "epoch": 0.6662854964741757, + "grad_norm": 3.7526612281799316, + "learning_rate": 9.811596744027304e-06, + "loss": 1.3247, + "step": 1311 + }, + { + "epoch": 0.666793723397497, + "grad_norm": 3.112264394760132, + "learning_rate": 9.811139302185817e-06, + "loss": 1.2754, + "step": 1312 + }, + { + "epoch": 0.6673019503208183, + "grad_norm": 3.145580768585205, + "learning_rate": 9.810681316374296e-06, + "loss": 1.3328, + "step": 1313 + }, + { + "epoch": 0.6678101772441395, + "grad_norm": 2.926412343978882, + "learning_rate": 9.810222786644526e-06, + "loss": 1.2873, + "step": 1314 + }, + { + "epoch": 0.6683184041674608, + "grad_norm": 2.8454012870788574, + "learning_rate": 9.809763713048347e-06, + "loss": 1.2252, + "step": 1315 + }, + { + "epoch": 0.668826631090782, + "grad_norm": 3.048414945602417, + "learning_rate": 9.809304095637665e-06, + "loss": 1.2712, + "step": 1316 + }, + { + "epoch": 0.6693348580141033, + "grad_norm": 2.9404375553131104, + "learning_rate": 9.80884393446445e-06, + "loss": 1.1873, + "step": 1317 + }, + { + "epoch": 0.6698430849374246, + "grad_norm": 3.0222291946411133, + "learning_rate": 9.808383229580724e-06, + "loss": 1.27, + "step": 1318 + }, + { + "epoch": 0.6703513118607458, + "grad_norm": 3.297321081161499, + "learning_rate": 9.807921981038581e-06, + "loss": 1.2672, + "step": 1319 + }, + { + "epoch": 0.6708595387840671, + "grad_norm": 3.1562671661376953, + "learning_rate": 9.80746018889017e-06, + "loss": 1.2629, + "step": 1320 + }, + { + "epoch": 0.6713677657073883, + "grad_norm": 2.894879102706909, + "learning_rate": 9.806997853187705e-06, + "loss": 1.2885, + "step": 1321 + }, + { + "epoch": 0.6718759926307096, + "grad_norm": 2.8734283447265625, + "learning_rate": 9.806534973983458e-06, + "loss": 1.2711, + "step": 1322 + }, + { + "epoch": 0.6723842195540308, + "grad_norm": 2.9292004108428955, + "learning_rate": 9.806071551329766e-06, + "loss": 1.2032, + "step": 1323 + }, + { + "epoch": 0.6728924464773521, + "grad_norm": 2.841843843460083, + "learning_rate": 9.805607585279022e-06, + "loss": 1.2444, + "step": 1324 + }, + { + "epoch": 0.6734006734006734, + "grad_norm": 3.2029173374176025, + "learning_rate": 9.80514307588369e-06, + "loss": 1.2899, + "step": 1325 + }, + { + "epoch": 0.6739089003239946, + "grad_norm": 2.921074151992798, + "learning_rate": 9.804678023196286e-06, + "loss": 1.1842, + "step": 1326 + }, + { + "epoch": 0.674417127247316, + "grad_norm": 2.954253673553467, + "learning_rate": 9.80421242726939e-06, + "loss": 1.3056, + "step": 1327 + }, + { + "epoch": 0.6749253541706371, + "grad_norm": 3.026883840560913, + "learning_rate": 9.803746288155647e-06, + "loss": 1.2471, + "step": 1328 + }, + { + "epoch": 0.6754335810939585, + "grad_norm": 2.9767909049987793, + "learning_rate": 9.80327960590776e-06, + "loss": 1.3336, + "step": 1329 + }, + { + "epoch": 0.6759418080172798, + "grad_norm": 2.963109016418457, + "learning_rate": 9.802812380578495e-06, + "loss": 1.1492, + "step": 1330 + }, + { + "epoch": 0.676450034940601, + "grad_norm": 2.853429079055786, + "learning_rate": 9.802344612220677e-06, + "loss": 1.2281, + "step": 1331 + }, + { + "epoch": 0.6769582618639223, + "grad_norm": 2.979201316833496, + "learning_rate": 9.801876300887195e-06, + "loss": 1.2248, + "step": 1332 + }, + { + "epoch": 0.6774664887872435, + "grad_norm": 3.138261318206787, + "learning_rate": 9.801407446631e-06, + "loss": 1.4046, + "step": 1333 + }, + { + "epoch": 0.6779747157105648, + "grad_norm": 3.044326066970825, + "learning_rate": 9.8009380495051e-06, + "loss": 1.2961, + "step": 1334 + }, + { + "epoch": 0.678482942633886, + "grad_norm": 3.0363643169403076, + "learning_rate": 9.80046810956257e-06, + "loss": 1.349, + "step": 1335 + }, + { + "epoch": 0.6789911695572073, + "grad_norm": 2.967984914779663, + "learning_rate": 9.799997626856539e-06, + "loss": 1.2037, + "step": 1336 + }, + { + "epoch": 0.6794993964805286, + "grad_norm": 2.81664776802063, + "learning_rate": 9.799526601440207e-06, + "loss": 1.2094, + "step": 1337 + }, + { + "epoch": 0.6800076234038498, + "grad_norm": 3.0124945640563965, + "learning_rate": 9.79905503336683e-06, + "loss": 1.3336, + "step": 1338 + }, + { + "epoch": 0.6805158503271711, + "grad_norm": 2.7598769664764404, + "learning_rate": 9.798582922689724e-06, + "loss": 1.2539, + "step": 1339 + }, + { + "epoch": 0.6810240772504923, + "grad_norm": 3.0373761653900146, + "learning_rate": 9.798110269462266e-06, + "loss": 1.3217, + "step": 1340 + }, + { + "epoch": 0.6815323041738136, + "grad_norm": 3.097094774246216, + "learning_rate": 9.797637073737901e-06, + "loss": 1.2075, + "step": 1341 + }, + { + "epoch": 0.6820405310971349, + "grad_norm": 2.749882698059082, + "learning_rate": 9.797163335570127e-06, + "loss": 1.3328, + "step": 1342 + }, + { + "epoch": 0.6825487580204561, + "grad_norm": 3.4999477863311768, + "learning_rate": 9.79668905501251e-06, + "loss": 1.3211, + "step": 1343 + }, + { + "epoch": 0.6830569849437774, + "grad_norm": 3.1416807174682617, + "learning_rate": 9.796214232118672e-06, + "loss": 1.3246, + "step": 1344 + }, + { + "epoch": 0.6835652118670986, + "grad_norm": 2.8817014694213867, + "learning_rate": 9.7957388669423e-06, + "loss": 1.2774, + "step": 1345 + }, + { + "epoch": 0.6840734387904199, + "grad_norm": 2.8663389682769775, + "learning_rate": 9.795262959537143e-06, + "loss": 1.287, + "step": 1346 + }, + { + "epoch": 0.6845816657137412, + "grad_norm": 3.0212528705596924, + "learning_rate": 9.794786509957002e-06, + "loss": 1.1961, + "step": 1347 + }, + { + "epoch": 0.6850898926370624, + "grad_norm": 2.8918073177337646, + "learning_rate": 9.794309518255755e-06, + "loss": 1.192, + "step": 1348 + }, + { + "epoch": 0.6855981195603837, + "grad_norm": 2.9363107681274414, + "learning_rate": 9.79383198448733e-06, + "loss": 1.2341, + "step": 1349 + }, + { + "epoch": 0.6861063464837049, + "grad_norm": 2.7646443843841553, + "learning_rate": 9.793353908705716e-06, + "loss": 1.1832, + "step": 1350 + }, + { + "epoch": 0.6866145734070263, + "grad_norm": 2.9691295623779297, + "learning_rate": 9.792875290964971e-06, + "loss": 1.1755, + "step": 1351 + }, + { + "epoch": 0.6871228003303474, + "grad_norm": 2.821946382522583, + "learning_rate": 9.792396131319208e-06, + "loss": 1.263, + "step": 1352 + }, + { + "epoch": 0.6876310272536688, + "grad_norm": 2.7758054733276367, + "learning_rate": 9.791916429822604e-06, + "loss": 1.2741, + "step": 1353 + }, + { + "epoch": 0.6881392541769901, + "grad_norm": 3.110229730606079, + "learning_rate": 9.791436186529392e-06, + "loss": 1.2129, + "step": 1354 + }, + { + "epoch": 0.6886474811003113, + "grad_norm": 3.091493606567383, + "learning_rate": 9.790955401493878e-06, + "loss": 1.2326, + "step": 1355 + }, + { + "epoch": 0.6891557080236326, + "grad_norm": 2.8974857330322266, + "learning_rate": 9.790474074770415e-06, + "loss": 1.2713, + "step": 1356 + }, + { + "epoch": 0.6896639349469538, + "grad_norm": 3.016157627105713, + "learning_rate": 9.789992206413428e-06, + "loss": 1.2726, + "step": 1357 + }, + { + "epoch": 0.6901721618702751, + "grad_norm": 2.9709484577178955, + "learning_rate": 9.7895097964774e-06, + "loss": 1.4299, + "step": 1358 + }, + { + "epoch": 0.6906803887935964, + "grad_norm": 2.8930253982543945, + "learning_rate": 9.789026845016868e-06, + "loss": 1.2822, + "step": 1359 + }, + { + "epoch": 0.6911886157169176, + "grad_norm": 2.8750662803649902, + "learning_rate": 9.788543352086447e-06, + "loss": 1.2785, + "step": 1360 + }, + { + "epoch": 0.6916968426402389, + "grad_norm": 3.3684775829315186, + "learning_rate": 9.788059317740793e-06, + "loss": 1.3986, + "step": 1361 + }, + { + "epoch": 0.6922050695635601, + "grad_norm": 2.6956255435943604, + "learning_rate": 9.78757474203464e-06, + "loss": 1.2541, + "step": 1362 + }, + { + "epoch": 0.6927132964868814, + "grad_norm": 2.7483339309692383, + "learning_rate": 9.787089625022772e-06, + "loss": 1.2703, + "step": 1363 + }, + { + "epoch": 0.6932215234102026, + "grad_norm": 3.469676971435547, + "learning_rate": 9.786603966760042e-06, + "loss": 1.3139, + "step": 1364 + }, + { + "epoch": 0.6937297503335239, + "grad_norm": 2.8216028213500977, + "learning_rate": 9.786117767301359e-06, + "loss": 1.2917, + "step": 1365 + }, + { + "epoch": 0.6942379772568452, + "grad_norm": 2.97011399269104, + "learning_rate": 9.785631026701695e-06, + "loss": 1.2288, + "step": 1366 + }, + { + "epoch": 0.6947462041801664, + "grad_norm": 3.1733460426330566, + "learning_rate": 9.785143745016085e-06, + "loss": 1.3337, + "step": 1367 + }, + { + "epoch": 0.6952544311034877, + "grad_norm": 3.0609326362609863, + "learning_rate": 9.78465592229962e-06, + "loss": 1.1612, + "step": 1368 + }, + { + "epoch": 0.6957626580268089, + "grad_norm": 2.876577854156494, + "learning_rate": 9.78416755860746e-06, + "loss": 1.3396, + "step": 1369 + }, + { + "epoch": 0.6962708849501302, + "grad_norm": 2.9949982166290283, + "learning_rate": 9.783678653994817e-06, + "loss": 1.1953, + "step": 1370 + }, + { + "epoch": 0.6967791118734515, + "grad_norm": 3.092203140258789, + "learning_rate": 9.783189208516972e-06, + "loss": 1.1856, + "step": 1371 + }, + { + "epoch": 0.6972873387967727, + "grad_norm": 2.965151071548462, + "learning_rate": 9.782699222229264e-06, + "loss": 1.2374, + "step": 1372 + }, + { + "epoch": 0.697795565720094, + "grad_norm": 2.849785327911377, + "learning_rate": 9.78220869518709e-06, + "loss": 1.2187, + "step": 1373 + }, + { + "epoch": 0.6983037926434152, + "grad_norm": 3.1366140842437744, + "learning_rate": 9.781717627445915e-06, + "loss": 1.3324, + "step": 1374 + }, + { + "epoch": 0.6988120195667366, + "grad_norm": 2.859644889831543, + "learning_rate": 9.78122601906126e-06, + "loss": 1.2878, + "step": 1375 + }, + { + "epoch": 0.6993202464900579, + "grad_norm": 2.927549123764038, + "learning_rate": 9.780733870088708e-06, + "loss": 1.3861, + "step": 1376 + }, + { + "epoch": 0.6998284734133791, + "grad_norm": 2.8348424434661865, + "learning_rate": 9.780241180583905e-06, + "loss": 1.178, + "step": 1377 + }, + { + "epoch": 0.7003367003367004, + "grad_norm": 3.0390775203704834, + "learning_rate": 9.779747950602553e-06, + "loss": 1.312, + "step": 1378 + }, + { + "epoch": 0.7008449272600216, + "grad_norm": 3.0308146476745605, + "learning_rate": 9.779254180200426e-06, + "loss": 1.2044, + "step": 1379 + }, + { + "epoch": 0.7013531541833429, + "grad_norm": 2.860550880432129, + "learning_rate": 9.778759869433345e-06, + "loss": 1.3131, + "step": 1380 + }, + { + "epoch": 0.7018613811066641, + "grad_norm": 3.319129705429077, + "learning_rate": 9.778265018357203e-06, + "loss": 1.2236, + "step": 1381 + }, + { + "epoch": 0.7023696080299854, + "grad_norm": 2.9930241107940674, + "learning_rate": 9.77776962702795e-06, + "loss": 1.249, + "step": 1382 + }, + { + "epoch": 0.7028778349533067, + "grad_norm": 2.9247124195098877, + "learning_rate": 9.777273695501594e-06, + "loss": 1.2426, + "step": 1383 + }, + { + "epoch": 0.7033860618766279, + "grad_norm": 3.4090874195098877, + "learning_rate": 9.776777223834212e-06, + "loss": 1.1573, + "step": 1384 + }, + { + "epoch": 0.7038942887999492, + "grad_norm": 3.1676511764526367, + "learning_rate": 9.776280212081934e-06, + "loss": 1.2312, + "step": 1385 + }, + { + "epoch": 0.7044025157232704, + "grad_norm": 3.1893248558044434, + "learning_rate": 9.775782660300957e-06, + "loss": 1.2459, + "step": 1386 + }, + { + "epoch": 0.7049107426465917, + "grad_norm": 2.791271686553955, + "learning_rate": 9.775284568547536e-06, + "loss": 1.156, + "step": 1387 + }, + { + "epoch": 0.705418969569913, + "grad_norm": 3.0256097316741943, + "learning_rate": 9.774785936877983e-06, + "loss": 1.3832, + "step": 1388 + }, + { + "epoch": 0.7059271964932342, + "grad_norm": 3.114658832550049, + "learning_rate": 9.774286765348684e-06, + "loss": 1.3485, + "step": 1389 + }, + { + "epoch": 0.7064354234165555, + "grad_norm": 2.794233798980713, + "learning_rate": 9.77378705401607e-06, + "loss": 1.1272, + "step": 1390 + }, + { + "epoch": 0.7069436503398767, + "grad_norm": 3.010028123855591, + "learning_rate": 9.773286802936644e-06, + "loss": 1.2159, + "step": 1391 + }, + { + "epoch": 0.707451877263198, + "grad_norm": 2.803492307662964, + "learning_rate": 9.772786012166968e-06, + "loss": 1.1581, + "step": 1392 + }, + { + "epoch": 0.7079601041865193, + "grad_norm": 2.8336427211761475, + "learning_rate": 9.772284681763662e-06, + "loss": 1.2794, + "step": 1393 + }, + { + "epoch": 0.7084683311098405, + "grad_norm": 3.0411875247955322, + "learning_rate": 9.771782811783408e-06, + "loss": 1.2202, + "step": 1394 + }, + { + "epoch": 0.7089765580331618, + "grad_norm": 3.8096001148223877, + "learning_rate": 9.771280402282953e-06, + "loss": 1.3383, + "step": 1395 + }, + { + "epoch": 0.709484784956483, + "grad_norm": 3.175851821899414, + "learning_rate": 9.770777453319098e-06, + "loss": 1.3495, + "step": 1396 + }, + { + "epoch": 0.7099930118798043, + "grad_norm": 3.015300989151001, + "learning_rate": 9.77027396494871e-06, + "loss": 1.2694, + "step": 1397 + }, + { + "epoch": 0.7105012388031255, + "grad_norm": 4.530679225921631, + "learning_rate": 9.769769937228716e-06, + "loss": 1.2853, + "step": 1398 + }, + { + "epoch": 0.7110094657264469, + "grad_norm": 2.898129463195801, + "learning_rate": 9.769265370216106e-06, + "loss": 1.223, + "step": 1399 + }, + { + "epoch": 0.7115176926497682, + "grad_norm": 3.0743815898895264, + "learning_rate": 9.768760263967927e-06, + "loss": 1.2532, + "step": 1400 + }, + { + "epoch": 0.7120259195730894, + "grad_norm": 2.855799674987793, + "learning_rate": 9.768254618541287e-06, + "loss": 1.2243, + "step": 1401 + }, + { + "epoch": 0.7125341464964107, + "grad_norm": 2.8209400177001953, + "learning_rate": 9.767748433993357e-06, + "loss": 1.2282, + "step": 1402 + }, + { + "epoch": 0.7130423734197319, + "grad_norm": 2.9385292530059814, + "learning_rate": 9.767241710381372e-06, + "loss": 1.3617, + "step": 1403 + }, + { + "epoch": 0.7135506003430532, + "grad_norm": 2.8516132831573486, + "learning_rate": 9.76673444776262e-06, + "loss": 1.271, + "step": 1404 + }, + { + "epoch": 0.7140588272663745, + "grad_norm": 2.887547254562378, + "learning_rate": 9.766226646194459e-06, + "loss": 1.1764, + "step": 1405 + }, + { + "epoch": 0.7145670541896957, + "grad_norm": 2.8994688987731934, + "learning_rate": 9.765718305734299e-06, + "loss": 1.1985, + "step": 1406 + }, + { + "epoch": 0.715075281113017, + "grad_norm": 3.094647169113159, + "learning_rate": 9.765209426439619e-06, + "loss": 1.2047, + "step": 1407 + }, + { + "epoch": 0.7155835080363382, + "grad_norm": 3.0000064373016357, + "learning_rate": 9.764700008367952e-06, + "loss": 1.175, + "step": 1408 + }, + { + "epoch": 0.7160917349596595, + "grad_norm": 2.8988466262817383, + "learning_rate": 9.764190051576898e-06, + "loss": 1.2322, + "step": 1409 + }, + { + "epoch": 0.7165999618829807, + "grad_norm": 2.796241044998169, + "learning_rate": 9.763679556124115e-06, + "loss": 1.2739, + "step": 1410 + }, + { + "epoch": 0.717108188806302, + "grad_norm": 2.8092799186706543, + "learning_rate": 9.76316852206732e-06, + "loss": 1.2592, + "step": 1411 + }, + { + "epoch": 0.7176164157296233, + "grad_norm": 2.8349976539611816, + "learning_rate": 9.762656949464293e-06, + "loss": 1.2057, + "step": 1412 + }, + { + "epoch": 0.7181246426529445, + "grad_norm": 2.937993288040161, + "learning_rate": 9.762144838372879e-06, + "loss": 1.2728, + "step": 1413 + }, + { + "epoch": 0.7186328695762658, + "grad_norm": 2.7717621326446533, + "learning_rate": 9.761632188850973e-06, + "loss": 1.1492, + "step": 1414 + }, + { + "epoch": 0.719141096499587, + "grad_norm": 2.7713875770568848, + "learning_rate": 9.761119000956543e-06, + "loss": 1.1935, + "step": 1415 + }, + { + "epoch": 0.7196493234229083, + "grad_norm": 3.239586353302002, + "learning_rate": 9.76060527474761e-06, + "loss": 1.2105, + "step": 1416 + }, + { + "epoch": 0.7201575503462296, + "grad_norm": 2.891342878341675, + "learning_rate": 9.76009101028226e-06, + "loss": 1.2722, + "step": 1417 + }, + { + "epoch": 0.7206657772695508, + "grad_norm": 3.0239803791046143, + "learning_rate": 9.759576207618636e-06, + "loss": 1.2555, + "step": 1418 + }, + { + "epoch": 0.7211740041928721, + "grad_norm": 2.953406810760498, + "learning_rate": 9.759060866814944e-06, + "loss": 1.2832, + "step": 1419 + }, + { + "epoch": 0.7216822311161933, + "grad_norm": 2.8011319637298584, + "learning_rate": 9.758544987929453e-06, + "loss": 1.1223, + "step": 1420 + }, + { + "epoch": 0.7221904580395146, + "grad_norm": 2.819378137588501, + "learning_rate": 9.758028571020489e-06, + "loss": 1.2726, + "step": 1421 + }, + { + "epoch": 0.722698684962836, + "grad_norm": 2.6413331031799316, + "learning_rate": 9.757511616146441e-06, + "loss": 1.185, + "step": 1422 + }, + { + "epoch": 0.7232069118861572, + "grad_norm": 2.5989086627960205, + "learning_rate": 9.75699412336576e-06, + "loss": 1.2007, + "step": 1423 + }, + { + "epoch": 0.7237151388094785, + "grad_norm": 2.8236801624298096, + "learning_rate": 9.756476092736953e-06, + "loss": 1.1923, + "step": 1424 + }, + { + "epoch": 0.7242233657327997, + "grad_norm": 2.875715970993042, + "learning_rate": 9.755957524318592e-06, + "loss": 1.2214, + "step": 1425 + }, + { + "epoch": 0.724731592656121, + "grad_norm": 2.9543588161468506, + "learning_rate": 9.75543841816931e-06, + "loss": 1.232, + "step": 1426 + }, + { + "epoch": 0.7252398195794422, + "grad_norm": 3.108790874481201, + "learning_rate": 9.7549187743478e-06, + "loss": 1.2526, + "step": 1427 + }, + { + "epoch": 0.7257480465027635, + "grad_norm": 3.0500638484954834, + "learning_rate": 9.754398592912813e-06, + "loss": 1.2936, + "step": 1428 + }, + { + "epoch": 0.7262562734260848, + "grad_norm": 2.8262805938720703, + "learning_rate": 9.753877873923164e-06, + "loss": 1.1733, + "step": 1429 + }, + { + "epoch": 0.726764500349406, + "grad_norm": 3.081902265548706, + "learning_rate": 9.75335661743773e-06, + "loss": 1.2526, + "step": 1430 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 2.996305465698242, + "learning_rate": 9.752834823515444e-06, + "loss": 1.2552, + "step": 1431 + }, + { + "epoch": 0.7277809541960485, + "grad_norm": 3.2910454273223877, + "learning_rate": 9.752312492215304e-06, + "loss": 1.2484, + "step": 1432 + }, + { + "epoch": 0.7282891811193698, + "grad_norm": 3.036968469619751, + "learning_rate": 9.751789623596366e-06, + "loss": 1.2597, + "step": 1433 + }, + { + "epoch": 0.7287974080426911, + "grad_norm": 2.843050956726074, + "learning_rate": 9.75126621771775e-06, + "loss": 1.2877, + "step": 1434 + }, + { + "epoch": 0.7293056349660123, + "grad_norm": 2.860912561416626, + "learning_rate": 9.750742274638632e-06, + "loss": 1.2826, + "step": 1435 + }, + { + "epoch": 0.7298138618893336, + "grad_norm": 2.9277420043945312, + "learning_rate": 9.750217794418254e-06, + "loss": 1.241, + "step": 1436 + }, + { + "epoch": 0.7303220888126548, + "grad_norm": 2.8361499309539795, + "learning_rate": 9.749692777115916e-06, + "loss": 1.2782, + "step": 1437 + }, + { + "epoch": 0.7308303157359761, + "grad_norm": 2.8240644931793213, + "learning_rate": 9.749167222790976e-06, + "loss": 1.1875, + "step": 1438 + }, + { + "epoch": 0.7313385426592974, + "grad_norm": 3.042060613632202, + "learning_rate": 9.748641131502858e-06, + "loss": 1.267, + "step": 1439 + }, + { + "epoch": 0.7318467695826186, + "grad_norm": 3.223292827606201, + "learning_rate": 9.748114503311045e-06, + "loss": 1.2628, + "step": 1440 + }, + { + "epoch": 0.7323549965059399, + "grad_norm": 2.960662841796875, + "learning_rate": 9.74758733827508e-06, + "loss": 1.2386, + "step": 1441 + }, + { + "epoch": 0.7328632234292611, + "grad_norm": 3.0385453701019287, + "learning_rate": 9.747059636454566e-06, + "loss": 1.1821, + "step": 1442 + }, + { + "epoch": 0.7333714503525824, + "grad_norm": 2.8012921810150146, + "learning_rate": 9.746531397909165e-06, + "loss": 1.1459, + "step": 1443 + }, + { + "epoch": 0.7338796772759036, + "grad_norm": 2.8723814487457275, + "learning_rate": 9.746002622698607e-06, + "loss": 1.227, + "step": 1444 + }, + { + "epoch": 0.734387904199225, + "grad_norm": 2.9052135944366455, + "learning_rate": 9.745473310882674e-06, + "loss": 1.2176, + "step": 1445 + }, + { + "epoch": 0.7348961311225463, + "grad_norm": 2.8227717876434326, + "learning_rate": 9.744943462521214e-06, + "loss": 1.2584, + "step": 1446 + }, + { + "epoch": 0.7354043580458675, + "grad_norm": 2.986020565032959, + "learning_rate": 9.744413077674134e-06, + "loss": 1.2, + "step": 1447 + }, + { + "epoch": 0.7359125849691888, + "grad_norm": 3.091575860977173, + "learning_rate": 9.7438821564014e-06, + "loss": 1.1782, + "step": 1448 + }, + { + "epoch": 0.73642081189251, + "grad_norm": 2.812776565551758, + "learning_rate": 9.743350698763046e-06, + "loss": 1.2385, + "step": 1449 + }, + { + "epoch": 0.7369290388158313, + "grad_norm": 3.120871067047119, + "learning_rate": 9.742818704819155e-06, + "loss": 1.2487, + "step": 1450 + }, + { + "epoch": 0.7374372657391526, + "grad_norm": 2.802520513534546, + "learning_rate": 9.742286174629879e-06, + "loss": 1.2003, + "step": 1451 + }, + { + "epoch": 0.7379454926624738, + "grad_norm": 3.259707450866699, + "learning_rate": 9.741753108255429e-06, + "loss": 1.2654, + "step": 1452 + }, + { + "epoch": 0.7384537195857951, + "grad_norm": 2.960662841796875, + "learning_rate": 9.741219505756074e-06, + "loss": 1.2144, + "step": 1453 + }, + { + "epoch": 0.7389619465091163, + "grad_norm": 3.017399787902832, + "learning_rate": 9.740685367192149e-06, + "loss": 1.1627, + "step": 1454 + }, + { + "epoch": 0.7394701734324376, + "grad_norm": 2.763535737991333, + "learning_rate": 9.740150692624044e-06, + "loss": 1.2747, + "step": 1455 + }, + { + "epoch": 0.7399784003557588, + "grad_norm": 2.646120309829712, + "learning_rate": 9.73961548211221e-06, + "loss": 1.1098, + "step": 1456 + }, + { + "epoch": 0.7404866272790801, + "grad_norm": 3.0598561763763428, + "learning_rate": 9.739079735717165e-06, + "loss": 1.2503, + "step": 1457 + }, + { + "epoch": 0.7409948542024014, + "grad_norm": 3.1667909622192383, + "learning_rate": 9.738543453499478e-06, + "loss": 1.2446, + "step": 1458 + }, + { + "epoch": 0.7415030811257226, + "grad_norm": 3.006512403488159, + "learning_rate": 9.738006635519788e-06, + "loss": 1.2218, + "step": 1459 + }, + { + "epoch": 0.7420113080490439, + "grad_norm": 3.4957993030548096, + "learning_rate": 9.737469281838786e-06, + "loss": 1.32, + "step": 1460 + }, + { + "epoch": 0.7425195349723651, + "grad_norm": 3.0907366275787354, + "learning_rate": 9.736931392517234e-06, + "loss": 1.2451, + "step": 1461 + }, + { + "epoch": 0.7430277618956864, + "grad_norm": 3.0201332569122314, + "learning_rate": 9.736392967615941e-06, + "loss": 1.2959, + "step": 1462 + }, + { + "epoch": 0.7435359888190077, + "grad_norm": 2.7725820541381836, + "learning_rate": 9.735854007195789e-06, + "loss": 1.2061, + "step": 1463 + }, + { + "epoch": 0.7440442157423289, + "grad_norm": 3.0488088130950928, + "learning_rate": 9.735314511317711e-06, + "loss": 1.2159, + "step": 1464 + }, + { + "epoch": 0.7445524426656502, + "grad_norm": 3.0015316009521484, + "learning_rate": 9.73477448004271e-06, + "loss": 1.3594, + "step": 1465 + }, + { + "epoch": 0.7450606695889714, + "grad_norm": 3.141895294189453, + "learning_rate": 9.73423391343184e-06, + "loss": 1.297, + "step": 1466 + }, + { + "epoch": 0.7455688965122927, + "grad_norm": 2.7780303955078125, + "learning_rate": 9.733692811546222e-06, + "loss": 1.1672, + "step": 1467 + }, + { + "epoch": 0.746077123435614, + "grad_norm": 2.9647746086120605, + "learning_rate": 9.733151174447038e-06, + "loss": 1.3291, + "step": 1468 + }, + { + "epoch": 0.7465853503589353, + "grad_norm": 3.054515838623047, + "learning_rate": 9.732609002195523e-06, + "loss": 1.2656, + "step": 1469 + }, + { + "epoch": 0.7470935772822566, + "grad_norm": 2.7921688556671143, + "learning_rate": 9.73206629485298e-06, + "loss": 1.2288, + "step": 1470 + }, + { + "epoch": 0.7476018042055778, + "grad_norm": 3.1555871963500977, + "learning_rate": 9.731523052480772e-06, + "loss": 1.2941, + "step": 1471 + }, + { + "epoch": 0.7481100311288991, + "grad_norm": 3.1695942878723145, + "learning_rate": 9.730979275140318e-06, + "loss": 1.3829, + "step": 1472 + }, + { + "epoch": 0.7486182580522203, + "grad_norm": 2.928703546524048, + "learning_rate": 9.730434962893098e-06, + "loss": 1.143, + "step": 1473 + }, + { + "epoch": 0.7491264849755416, + "grad_norm": 2.8269565105438232, + "learning_rate": 9.72989011580066e-06, + "loss": 1.1911, + "step": 1474 + }, + { + "epoch": 0.7496347118988629, + "grad_norm": 2.864147663116455, + "learning_rate": 9.729344733924603e-06, + "loss": 1.3372, + "step": 1475 + }, + { + "epoch": 0.7501429388221841, + "grad_norm": 2.9000654220581055, + "learning_rate": 9.728798817326592e-06, + "loss": 1.2584, + "step": 1476 + }, + { + "epoch": 0.7506511657455054, + "grad_norm": 2.9683735370635986, + "learning_rate": 9.72825236606835e-06, + "loss": 1.2438, + "step": 1477 + }, + { + "epoch": 0.7511593926688266, + "grad_norm": 3.1077730655670166, + "learning_rate": 9.727705380211662e-06, + "loss": 1.2655, + "step": 1478 + }, + { + "epoch": 0.7516676195921479, + "grad_norm": 2.839165687561035, + "learning_rate": 9.727157859818372e-06, + "loss": 1.2896, + "step": 1479 + }, + { + "epoch": 0.7521758465154692, + "grad_norm": 2.8478798866271973, + "learning_rate": 9.726609804950388e-06, + "loss": 1.2452, + "step": 1480 + }, + { + "epoch": 0.7526840734387904, + "grad_norm": 3.012943744659424, + "learning_rate": 9.72606121566967e-06, + "loss": 1.2447, + "step": 1481 + }, + { + "epoch": 0.7531923003621117, + "grad_norm": 2.7149770259857178, + "learning_rate": 9.725512092038251e-06, + "loss": 1.1905, + "step": 1482 + }, + { + "epoch": 0.7537005272854329, + "grad_norm": 2.8013172149658203, + "learning_rate": 9.724962434118213e-06, + "loss": 1.0993, + "step": 1483 + }, + { + "epoch": 0.7542087542087542, + "grad_norm": 2.8769729137420654, + "learning_rate": 9.724412241971703e-06, + "loss": 1.3132, + "step": 1484 + }, + { + "epoch": 0.7547169811320755, + "grad_norm": 2.906467914581299, + "learning_rate": 9.723861515660931e-06, + "loss": 1.2811, + "step": 1485 + }, + { + "epoch": 0.7552252080553967, + "grad_norm": 2.7540318965911865, + "learning_rate": 9.72331025524816e-06, + "loss": 1.2457, + "step": 1486 + }, + { + "epoch": 0.755733434978718, + "grad_norm": 3.0037455558776855, + "learning_rate": 9.722758460795723e-06, + "loss": 1.2976, + "step": 1487 + }, + { + "epoch": 0.7562416619020392, + "grad_norm": 3.0428314208984375, + "learning_rate": 9.722206132366008e-06, + "loss": 1.2379, + "step": 1488 + }, + { + "epoch": 0.7567498888253605, + "grad_norm": 2.7325022220611572, + "learning_rate": 9.721653270021461e-06, + "loss": 1.2126, + "step": 1489 + }, + { + "epoch": 0.7572581157486817, + "grad_norm": 2.63283371925354, + "learning_rate": 9.72109987382459e-06, + "loss": 1.2667, + "step": 1490 + }, + { + "epoch": 0.757766342672003, + "grad_norm": 2.848900556564331, + "learning_rate": 9.720545943837972e-06, + "loss": 1.2651, + "step": 1491 + }, + { + "epoch": 0.7582745695953244, + "grad_norm": 2.9327495098114014, + "learning_rate": 9.71999148012423e-06, + "loss": 1.2489, + "step": 1492 + }, + { + "epoch": 0.7587827965186456, + "grad_norm": 3.18332576751709, + "learning_rate": 9.719436482746054e-06, + "loss": 1.3644, + "step": 1493 + }, + { + "epoch": 0.7592910234419669, + "grad_norm": 2.8493423461914062, + "learning_rate": 9.718880951766201e-06, + "loss": 1.1427, + "step": 1494 + }, + { + "epoch": 0.7597992503652881, + "grad_norm": 3.0256540775299072, + "learning_rate": 9.718324887247475e-06, + "loss": 1.3127, + "step": 1495 + }, + { + "epoch": 0.7603074772886094, + "grad_norm": 2.7205774784088135, + "learning_rate": 9.717768289252752e-06, + "loss": 1.1484, + "step": 1496 + }, + { + "epoch": 0.7608157042119307, + "grad_norm": 2.971435546875, + "learning_rate": 9.717211157844962e-06, + "loss": 1.2894, + "step": 1497 + }, + { + "epoch": 0.7613239311352519, + "grad_norm": 3.055706262588501, + "learning_rate": 9.716653493087096e-06, + "loss": 1.2505, + "step": 1498 + }, + { + "epoch": 0.7618321580585732, + "grad_norm": 2.809715747833252, + "learning_rate": 9.716095295042207e-06, + "loss": 1.1809, + "step": 1499 + }, + { + "epoch": 0.7623403849818944, + "grad_norm": 2.8183910846710205, + "learning_rate": 9.715536563773407e-06, + "loss": 1.148, + "step": 1500 + }, + { + "epoch": 0.7623403849818944, + "eval_loss": 1.2643159627914429, + "eval_runtime": 12.322, + "eval_samples_per_second": 32.462, + "eval_steps_per_second": 4.058, + "step": 1500 + }, + { + "epoch": 0.7628486119052157, + "grad_norm": 2.898142099380493, + "learning_rate": 9.71497729934387e-06, + "loss": 1.2616, + "step": 1501 + }, + { + "epoch": 0.7633568388285369, + "grad_norm": 2.7970736026763916, + "learning_rate": 9.714417501816826e-06, + "loss": 1.2414, + "step": 1502 + }, + { + "epoch": 0.7638650657518582, + "grad_norm": 2.9098377227783203, + "learning_rate": 9.713857171255574e-06, + "loss": 1.2983, + "step": 1503 + }, + { + "epoch": 0.7643732926751795, + "grad_norm": 2.860549211502075, + "learning_rate": 9.713296307723463e-06, + "loss": 1.1495, + "step": 1504 + }, + { + "epoch": 0.7648815195985007, + "grad_norm": 2.819836378097534, + "learning_rate": 9.712734911283907e-06, + "loss": 1.1737, + "step": 1505 + }, + { + "epoch": 0.765389746521822, + "grad_norm": 3.5737171173095703, + "learning_rate": 9.712172982000382e-06, + "loss": 1.3854, + "step": 1506 + }, + { + "epoch": 0.7658979734451432, + "grad_norm": 3.0363149642944336, + "learning_rate": 9.71161051993642e-06, + "loss": 1.2698, + "step": 1507 + }, + { + "epoch": 0.7664062003684645, + "grad_norm": 3.0048258304595947, + "learning_rate": 9.711047525155619e-06, + "loss": 1.3692, + "step": 1508 + }, + { + "epoch": 0.7669144272917858, + "grad_norm": 2.9466333389282227, + "learning_rate": 9.710483997721633e-06, + "loss": 1.2379, + "step": 1509 + }, + { + "epoch": 0.767422654215107, + "grad_norm": 2.9100375175476074, + "learning_rate": 9.709919937698175e-06, + "loss": 1.1373, + "step": 1510 + }, + { + "epoch": 0.7679308811384283, + "grad_norm": 2.9696006774902344, + "learning_rate": 9.70935534514902e-06, + "loss": 1.2764, + "step": 1511 + }, + { + "epoch": 0.7684391080617495, + "grad_norm": 2.826723098754883, + "learning_rate": 9.708790220138007e-06, + "loss": 1.2072, + "step": 1512 + }, + { + "epoch": 0.7689473349850708, + "grad_norm": 3.223733425140381, + "learning_rate": 9.708224562729027e-06, + "loss": 1.2815, + "step": 1513 + }, + { + "epoch": 0.7694555619083921, + "grad_norm": 2.8028769493103027, + "learning_rate": 9.70765837298604e-06, + "loss": 1.2197, + "step": 1514 + }, + { + "epoch": 0.7699637888317133, + "grad_norm": 2.8905370235443115, + "learning_rate": 9.707091650973061e-06, + "loss": 1.3065, + "step": 1515 + }, + { + "epoch": 0.7704720157550347, + "grad_norm": 2.9921021461486816, + "learning_rate": 9.706524396754164e-06, + "loss": 1.3296, + "step": 1516 + }, + { + "epoch": 0.7709802426783559, + "grad_norm": 2.9344661235809326, + "learning_rate": 9.70595661039349e-06, + "loss": 1.4179, + "step": 1517 + }, + { + "epoch": 0.7714884696016772, + "grad_norm": 2.6728525161743164, + "learning_rate": 9.70538829195523e-06, + "loss": 1.2245, + "step": 1518 + }, + { + "epoch": 0.7719966965249984, + "grad_norm": 2.7900071144104004, + "learning_rate": 9.704819441503646e-06, + "loss": 1.1504, + "step": 1519 + }, + { + "epoch": 0.7725049234483197, + "grad_norm": 3.0739340782165527, + "learning_rate": 9.704250059103051e-06, + "loss": 1.2744, + "step": 1520 + }, + { + "epoch": 0.773013150371641, + "grad_norm": 2.846035957336426, + "learning_rate": 9.703680144817821e-06, + "loss": 1.0986, + "step": 1521 + }, + { + "epoch": 0.7735213772949622, + "grad_norm": 3.0878632068634033, + "learning_rate": 9.703109698712401e-06, + "loss": 1.324, + "step": 1522 + }, + { + "epoch": 0.7740296042182835, + "grad_norm": 2.9029667377471924, + "learning_rate": 9.702538720851279e-06, + "loss": 1.2852, + "step": 1523 + }, + { + "epoch": 0.7745378311416047, + "grad_norm": 2.980501890182495, + "learning_rate": 9.701967211299017e-06, + "loss": 1.2395, + "step": 1524 + }, + { + "epoch": 0.775046058064926, + "grad_norm": 2.8804404735565186, + "learning_rate": 9.701395170120233e-06, + "loss": 1.1636, + "step": 1525 + }, + { + "epoch": 0.7755542849882473, + "grad_norm": 2.804990768432617, + "learning_rate": 9.700822597379604e-06, + "loss": 1.0939, + "step": 1526 + }, + { + "epoch": 0.7760625119115685, + "grad_norm": 2.904367208480835, + "learning_rate": 9.700249493141867e-06, + "loss": 1.3072, + "step": 1527 + }, + { + "epoch": 0.7765707388348898, + "grad_norm": 3.0249783992767334, + "learning_rate": 9.69967585747182e-06, + "loss": 1.274, + "step": 1528 + }, + { + "epoch": 0.777078965758211, + "grad_norm": 2.8509297370910645, + "learning_rate": 9.69910169043432e-06, + "loss": 1.2317, + "step": 1529 + }, + { + "epoch": 0.7775871926815323, + "grad_norm": 3.515911102294922, + "learning_rate": 9.698526992094288e-06, + "loss": 1.2212, + "step": 1530 + }, + { + "epoch": 0.7780954196048536, + "grad_norm": 2.891103982925415, + "learning_rate": 9.6979517625167e-06, + "loss": 1.2583, + "step": 1531 + }, + { + "epoch": 0.7786036465281748, + "grad_norm": 2.970613956451416, + "learning_rate": 9.697376001766595e-06, + "loss": 1.1725, + "step": 1532 + }, + { + "epoch": 0.7791118734514961, + "grad_norm": 2.938046932220459, + "learning_rate": 9.69679970990907e-06, + "loss": 1.2778, + "step": 1533 + }, + { + "epoch": 0.7796201003748173, + "grad_norm": 2.8662068843841553, + "learning_rate": 9.696222887009283e-06, + "loss": 1.2765, + "step": 1534 + }, + { + "epoch": 0.7801283272981386, + "grad_norm": 2.9136219024658203, + "learning_rate": 9.695645533132455e-06, + "loss": 1.2756, + "step": 1535 + }, + { + "epoch": 0.7806365542214598, + "grad_norm": 2.9310011863708496, + "learning_rate": 9.695067648343862e-06, + "loss": 1.2819, + "step": 1536 + }, + { + "epoch": 0.7811447811447811, + "grad_norm": 3.0941317081451416, + "learning_rate": 9.694489232708843e-06, + "loss": 1.2342, + "step": 1537 + }, + { + "epoch": 0.7816530080681025, + "grad_norm": 2.9651567935943604, + "learning_rate": 9.693910286292797e-06, + "loss": 1.3028, + "step": 1538 + }, + { + "epoch": 0.7821612349914236, + "grad_norm": 2.940019130706787, + "learning_rate": 9.69333080916118e-06, + "loss": 1.1719, + "step": 1539 + }, + { + "epoch": 0.782669461914745, + "grad_norm": 2.8346259593963623, + "learning_rate": 9.692750801379514e-06, + "loss": 1.3167, + "step": 1540 + }, + { + "epoch": 0.7831776888380662, + "grad_norm": 2.784411907196045, + "learning_rate": 9.692170263013376e-06, + "loss": 1.2454, + "step": 1541 + }, + { + "epoch": 0.7836859157613875, + "grad_norm": 2.9267518520355225, + "learning_rate": 9.691589194128403e-06, + "loss": 1.219, + "step": 1542 + }, + { + "epoch": 0.7841941426847088, + "grad_norm": 2.6732523441314697, + "learning_rate": 9.691007594790295e-06, + "loss": 1.2958, + "step": 1543 + }, + { + "epoch": 0.78470236960803, + "grad_norm": 3.058943510055542, + "learning_rate": 9.69042546506481e-06, + "loss": 1.3182, + "step": 1544 + }, + { + "epoch": 0.7852105965313513, + "grad_norm": 2.853072166442871, + "learning_rate": 9.689842805017765e-06, + "loss": 1.2758, + "step": 1545 + }, + { + "epoch": 0.7857188234546725, + "grad_norm": 3.0760834217071533, + "learning_rate": 9.689259614715039e-06, + "loss": 1.2394, + "step": 1546 + }, + { + "epoch": 0.7862270503779938, + "grad_norm": 2.931668758392334, + "learning_rate": 9.688675894222572e-06, + "loss": 1.3268, + "step": 1547 + }, + { + "epoch": 0.786735277301315, + "grad_norm": 2.7671284675598145, + "learning_rate": 9.68809164360636e-06, + "loss": 1.2555, + "step": 1548 + }, + { + "epoch": 0.7872435042246363, + "grad_norm": 3.0845117568969727, + "learning_rate": 9.687506862932464e-06, + "loss": 1.2875, + "step": 1549 + }, + { + "epoch": 0.7877517311479576, + "grad_norm": 3.1043455600738525, + "learning_rate": 9.686921552266997e-06, + "loss": 1.2578, + "step": 1550 + }, + { + "epoch": 0.7882599580712788, + "grad_norm": 2.8478760719299316, + "learning_rate": 9.686335711676142e-06, + "loss": 1.2669, + "step": 1551 + }, + { + "epoch": 0.7887681849946001, + "grad_norm": 2.740041494369507, + "learning_rate": 9.685749341226134e-06, + "loss": 1.2157, + "step": 1552 + }, + { + "epoch": 0.7892764119179213, + "grad_norm": 2.8490264415740967, + "learning_rate": 9.685162440983272e-06, + "loss": 1.2503, + "step": 1553 + }, + { + "epoch": 0.7897846388412426, + "grad_norm": 2.845862865447998, + "learning_rate": 9.684575011013912e-06, + "loss": 1.3621, + "step": 1554 + }, + { + "epoch": 0.7902928657645639, + "grad_norm": 2.9016470909118652, + "learning_rate": 9.683987051384475e-06, + "loss": 1.3163, + "step": 1555 + }, + { + "epoch": 0.7908010926878851, + "grad_norm": 3.1869518756866455, + "learning_rate": 9.683398562161434e-06, + "loss": 1.302, + "step": 1556 + }, + { + "epoch": 0.7913093196112064, + "grad_norm": 3.030754327774048, + "learning_rate": 9.68280954341133e-06, + "loss": 1.3103, + "step": 1557 + }, + { + "epoch": 0.7918175465345276, + "grad_norm": 3.1585705280303955, + "learning_rate": 9.68221999520076e-06, + "loss": 1.37, + "step": 1558 + }, + { + "epoch": 0.7923257734578489, + "grad_norm": 2.867959976196289, + "learning_rate": 9.68162991759638e-06, + "loss": 1.17, + "step": 1559 + }, + { + "epoch": 0.7928340003811702, + "grad_norm": 3.2136871814727783, + "learning_rate": 9.681039310664906e-06, + "loss": 1.2515, + "step": 1560 + }, + { + "epoch": 0.7933422273044914, + "grad_norm": 3.129521608352661, + "learning_rate": 9.680448174473116e-06, + "loss": 1.2155, + "step": 1561 + }, + { + "epoch": 0.7938504542278128, + "grad_norm": 2.799604654312134, + "learning_rate": 9.679856509087847e-06, + "loss": 1.2057, + "step": 1562 + }, + { + "epoch": 0.794358681151134, + "grad_norm": 2.9921875, + "learning_rate": 9.679264314575996e-06, + "loss": 1.2361, + "step": 1563 + }, + { + "epoch": 0.7948669080744553, + "grad_norm": 2.982118606567383, + "learning_rate": 9.678671591004517e-06, + "loss": 1.2876, + "step": 1564 + }, + { + "epoch": 0.7953751349977765, + "grad_norm": 2.834472179412842, + "learning_rate": 9.678078338440426e-06, + "loss": 1.1996, + "step": 1565 + }, + { + "epoch": 0.7958833619210978, + "grad_norm": 2.7313015460968018, + "learning_rate": 9.677484556950802e-06, + "loss": 1.1582, + "step": 1566 + }, + { + "epoch": 0.7963915888444191, + "grad_norm": 2.772125244140625, + "learning_rate": 9.676890246602778e-06, + "loss": 1.1159, + "step": 1567 + }, + { + "epoch": 0.7968998157677403, + "grad_norm": 2.912230968475342, + "learning_rate": 9.676295407463551e-06, + "loss": 1.2765, + "step": 1568 + }, + { + "epoch": 0.7974080426910616, + "grad_norm": 2.979102611541748, + "learning_rate": 9.675700039600377e-06, + "loss": 1.3157, + "step": 1569 + }, + { + "epoch": 0.7979162696143828, + "grad_norm": 2.7840914726257324, + "learning_rate": 9.675104143080569e-06, + "loss": 1.1945, + "step": 1570 + }, + { + "epoch": 0.7984244965377041, + "grad_norm": 2.832731008529663, + "learning_rate": 9.674507717971502e-06, + "loss": 1.2942, + "step": 1571 + }, + { + "epoch": 0.7989327234610254, + "grad_norm": 2.896554470062256, + "learning_rate": 9.673910764340613e-06, + "loss": 1.2832, + "step": 1572 + }, + { + "epoch": 0.7994409503843466, + "grad_norm": 2.8940999507904053, + "learning_rate": 9.673313282255395e-06, + "loss": 1.2314, + "step": 1573 + }, + { + "epoch": 0.7999491773076679, + "grad_norm": 2.7886762619018555, + "learning_rate": 9.6727152717834e-06, + "loss": 1.227, + "step": 1574 + }, + { + "epoch": 0.8004574042309891, + "grad_norm": 2.9096152782440186, + "learning_rate": 9.672116732992245e-06, + "loss": 1.211, + "step": 1575 + }, + { + "epoch": 0.8009656311543104, + "grad_norm": 3.0253443717956543, + "learning_rate": 9.6715176659496e-06, + "loss": 1.2943, + "step": 1576 + }, + { + "epoch": 0.8014738580776317, + "grad_norm": 3.041499376296997, + "learning_rate": 9.670918070723206e-06, + "loss": 1.2964, + "step": 1577 + }, + { + "epoch": 0.8019820850009529, + "grad_norm": 3.052034378051758, + "learning_rate": 9.670317947380847e-06, + "loss": 1.2971, + "step": 1578 + }, + { + "epoch": 0.8024903119242742, + "grad_norm": 2.8331234455108643, + "learning_rate": 9.66971729599038e-06, + "loss": 1.2349, + "step": 1579 + }, + { + "epoch": 0.8029985388475954, + "grad_norm": 2.987531900405884, + "learning_rate": 9.669116116619717e-06, + "loss": 1.2844, + "step": 1580 + }, + { + "epoch": 0.8035067657709167, + "grad_norm": 3.0655086040496826, + "learning_rate": 9.668514409336831e-06, + "loss": 1.2412, + "step": 1581 + }, + { + "epoch": 0.8040149926942379, + "grad_norm": 2.681715965270996, + "learning_rate": 9.667912174209753e-06, + "loss": 1.1691, + "step": 1582 + }, + { + "epoch": 0.8045232196175592, + "grad_norm": 2.923539876937866, + "learning_rate": 9.667309411306574e-06, + "loss": 1.3403, + "step": 1583 + }, + { + "epoch": 0.8050314465408805, + "grad_norm": 2.8867475986480713, + "learning_rate": 9.666706120695447e-06, + "loss": 1.336, + "step": 1584 + }, + { + "epoch": 0.8055396734642017, + "grad_norm": 2.9885010719299316, + "learning_rate": 9.66610230244458e-06, + "loss": 1.2957, + "step": 1585 + }, + { + "epoch": 0.806047900387523, + "grad_norm": 2.730257749557495, + "learning_rate": 9.665497956622247e-06, + "loss": 1.1617, + "step": 1586 + }, + { + "epoch": 0.8065561273108443, + "grad_norm": 3.0298240184783936, + "learning_rate": 9.664893083296777e-06, + "loss": 1.3732, + "step": 1587 + }, + { + "epoch": 0.8070643542341656, + "grad_norm": 2.7434775829315186, + "learning_rate": 9.664287682536558e-06, + "loss": 1.1253, + "step": 1588 + }, + { + "epoch": 0.8075725811574869, + "grad_norm": 2.753551483154297, + "learning_rate": 9.663681754410038e-06, + "loss": 1.2321, + "step": 1589 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 2.7053587436676025, + "learning_rate": 9.663075298985733e-06, + "loss": 1.2795, + "step": 1590 + }, + { + "epoch": 0.8085890350041294, + "grad_norm": 2.874924898147583, + "learning_rate": 9.662468316332205e-06, + "loss": 1.2494, + "step": 1591 + }, + { + "epoch": 0.8090972619274506, + "grad_norm": 3.1453142166137695, + "learning_rate": 9.661860806518086e-06, + "loss": 1.3158, + "step": 1592 + }, + { + "epoch": 0.8096054888507719, + "grad_norm": 2.962503433227539, + "learning_rate": 9.661252769612063e-06, + "loss": 1.3158, + "step": 1593 + }, + { + "epoch": 0.8101137157740931, + "grad_norm": 3.0778138637542725, + "learning_rate": 9.660644205682884e-06, + "loss": 1.2964, + "step": 1594 + }, + { + "epoch": 0.8106219426974144, + "grad_norm": 2.989445924758911, + "learning_rate": 9.660035114799353e-06, + "loss": 1.3058, + "step": 1595 + }, + { + "epoch": 0.8111301696207357, + "grad_norm": 2.8797903060913086, + "learning_rate": 9.659425497030339e-06, + "loss": 1.1792, + "step": 1596 + }, + { + "epoch": 0.8116383965440569, + "grad_norm": 3.105631113052368, + "learning_rate": 9.65881535244477e-06, + "loss": 1.303, + "step": 1597 + }, + { + "epoch": 0.8121466234673782, + "grad_norm": 2.780606269836426, + "learning_rate": 9.658204681111628e-06, + "loss": 1.1623, + "step": 1598 + }, + { + "epoch": 0.8126548503906994, + "grad_norm": 5.6422038078308105, + "learning_rate": 9.657593483099962e-06, + "loss": 1.4302, + "step": 1599 + }, + { + "epoch": 0.8131630773140207, + "grad_norm": 3.0730020999908447, + "learning_rate": 9.656981758478875e-06, + "loss": 1.2633, + "step": 1600 + }, + { + "epoch": 0.813671304237342, + "grad_norm": 3.3350472450256348, + "learning_rate": 9.656369507317532e-06, + "loss": 1.201, + "step": 1601 + }, + { + "epoch": 0.8141795311606632, + "grad_norm": 2.7912869453430176, + "learning_rate": 9.655756729685156e-06, + "loss": 1.1654, + "step": 1602 + }, + { + "epoch": 0.8146877580839845, + "grad_norm": 2.8811697959899902, + "learning_rate": 9.655143425651033e-06, + "loss": 1.1811, + "step": 1603 + }, + { + "epoch": 0.8151959850073057, + "grad_norm": 2.713759183883667, + "learning_rate": 9.654529595284503e-06, + "loss": 1.1562, + "step": 1604 + }, + { + "epoch": 0.815704211930627, + "grad_norm": 2.927468776702881, + "learning_rate": 9.653915238654972e-06, + "loss": 1.2829, + "step": 1605 + }, + { + "epoch": 0.8162124388539483, + "grad_norm": 2.8604557514190674, + "learning_rate": 9.653300355831898e-06, + "loss": 1.2372, + "step": 1606 + }, + { + "epoch": 0.8167206657772695, + "grad_norm": 2.864851236343384, + "learning_rate": 9.652684946884806e-06, + "loss": 1.3857, + "step": 1607 + }, + { + "epoch": 0.8172288927005908, + "grad_norm": 3.0702593326568604, + "learning_rate": 9.652069011883273e-06, + "loss": 1.2066, + "step": 1608 + }, + { + "epoch": 0.817737119623912, + "grad_norm": 2.893040180206299, + "learning_rate": 9.651452550896943e-06, + "loss": 1.1917, + "step": 1609 + }, + { + "epoch": 0.8182453465472334, + "grad_norm": 2.9085614681243896, + "learning_rate": 9.650835563995516e-06, + "loss": 1.246, + "step": 1610 + }, + { + "epoch": 0.8187535734705546, + "grad_norm": 3.080528974533081, + "learning_rate": 9.65021805124875e-06, + "loss": 1.2369, + "step": 1611 + }, + { + "epoch": 0.8192618003938759, + "grad_norm": 2.8631365299224854, + "learning_rate": 9.649600012726465e-06, + "loss": 1.2071, + "step": 1612 + }, + { + "epoch": 0.8197700273171972, + "grad_norm": 3.306487560272217, + "learning_rate": 9.648981448498538e-06, + "loss": 1.2006, + "step": 1613 + }, + { + "epoch": 0.8202782542405184, + "grad_norm": 2.7040047645568848, + "learning_rate": 9.648362358634907e-06, + "loss": 1.2456, + "step": 1614 + }, + { + "epoch": 0.8207864811638397, + "grad_norm": 3.003469228744507, + "learning_rate": 9.64774274320557e-06, + "loss": 1.192, + "step": 1615 + }, + { + "epoch": 0.8212947080871609, + "grad_norm": 3.2069551944732666, + "learning_rate": 9.647122602280585e-06, + "loss": 1.3296, + "step": 1616 + }, + { + "epoch": 0.8218029350104822, + "grad_norm": 2.9010188579559326, + "learning_rate": 9.646501935930064e-06, + "loss": 1.2709, + "step": 1617 + }, + { + "epoch": 0.8223111619338035, + "grad_norm": 3.0305323600769043, + "learning_rate": 9.645880744224185e-06, + "loss": 1.2166, + "step": 1618 + }, + { + "epoch": 0.8228193888571247, + "grad_norm": 2.9393057823181152, + "learning_rate": 9.645259027233185e-06, + "loss": 1.2345, + "step": 1619 + }, + { + "epoch": 0.823327615780446, + "grad_norm": 2.836444139480591, + "learning_rate": 9.644636785027355e-06, + "loss": 1.1531, + "step": 1620 + }, + { + "epoch": 0.8238358427037672, + "grad_norm": 3.178603172302246, + "learning_rate": 9.644014017677049e-06, + "loss": 1.2349, + "step": 1621 + }, + { + "epoch": 0.8243440696270885, + "grad_norm": 2.6164798736572266, + "learning_rate": 9.64339072525268e-06, + "loss": 1.244, + "step": 1622 + }, + { + "epoch": 0.8248522965504097, + "grad_norm": 2.7259740829467773, + "learning_rate": 9.642766907824721e-06, + "loss": 1.2564, + "step": 1623 + }, + { + "epoch": 0.825360523473731, + "grad_norm": 2.822526454925537, + "learning_rate": 9.642142565463705e-06, + "loss": 1.2629, + "step": 1624 + }, + { + "epoch": 0.8258687503970523, + "grad_norm": 2.8354594707489014, + "learning_rate": 9.641517698240221e-06, + "loss": 1.2838, + "step": 1625 + }, + { + "epoch": 0.8263769773203735, + "grad_norm": 2.7072620391845703, + "learning_rate": 9.64089230622492e-06, + "loss": 1.0654, + "step": 1626 + }, + { + "epoch": 0.8268852042436948, + "grad_norm": 3.053953170776367, + "learning_rate": 9.640266389488512e-06, + "loss": 1.2494, + "step": 1627 + }, + { + "epoch": 0.827393431167016, + "grad_norm": 2.87473201751709, + "learning_rate": 9.639639948101767e-06, + "loss": 1.169, + "step": 1628 + }, + { + "epoch": 0.8279016580903373, + "grad_norm": 3.2058591842651367, + "learning_rate": 9.639012982135512e-06, + "loss": 1.2292, + "step": 1629 + }, + { + "epoch": 0.8284098850136586, + "grad_norm": 3.0206425189971924, + "learning_rate": 9.638385491660633e-06, + "loss": 1.3061, + "step": 1630 + }, + { + "epoch": 0.8289181119369798, + "grad_norm": 3.0649890899658203, + "learning_rate": 9.637757476748081e-06, + "loss": 1.2873, + "step": 1631 + }, + { + "epoch": 0.8294263388603011, + "grad_norm": 3.119568109512329, + "learning_rate": 9.637128937468862e-06, + "loss": 1.2597, + "step": 1632 + }, + { + "epoch": 0.8299345657836223, + "grad_norm": 2.910027027130127, + "learning_rate": 9.636499873894038e-06, + "loss": 1.1835, + "step": 1633 + }, + { + "epoch": 0.8304427927069437, + "grad_norm": 3.029801845550537, + "learning_rate": 9.635870286094738e-06, + "loss": 1.3794, + "step": 1634 + }, + { + "epoch": 0.830951019630265, + "grad_norm": 2.6900525093078613, + "learning_rate": 9.635240174142142e-06, + "loss": 1.2792, + "step": 1635 + }, + { + "epoch": 0.8314592465535862, + "grad_norm": 2.8703951835632324, + "learning_rate": 9.634609538107498e-06, + "loss": 1.2806, + "step": 1636 + }, + { + "epoch": 0.8319674734769075, + "grad_norm": 2.82772159576416, + "learning_rate": 9.633978378062103e-06, + "loss": 1.1742, + "step": 1637 + }, + { + "epoch": 0.8324757004002287, + "grad_norm": 3.2928287982940674, + "learning_rate": 9.633346694077324e-06, + "loss": 1.2234, + "step": 1638 + }, + { + "epoch": 0.83298392732355, + "grad_norm": 3.0190470218658447, + "learning_rate": 9.632714486224581e-06, + "loss": 1.1061, + "step": 1639 + }, + { + "epoch": 0.8334921542468712, + "grad_norm": 3.1004772186279297, + "learning_rate": 9.632081754575352e-06, + "loss": 1.325, + "step": 1640 + }, + { + "epoch": 0.8340003811701925, + "grad_norm": 2.919175386428833, + "learning_rate": 9.63144849920118e-06, + "loss": 1.2204, + "step": 1641 + }, + { + "epoch": 0.8345086080935138, + "grad_norm": 2.95920729637146, + "learning_rate": 9.630814720173662e-06, + "loss": 1.2594, + "step": 1642 + }, + { + "epoch": 0.835016835016835, + "grad_norm": 2.7796289920806885, + "learning_rate": 9.630180417564456e-06, + "loss": 1.2342, + "step": 1643 + }, + { + "epoch": 0.8355250619401563, + "grad_norm": 3.0137064456939697, + "learning_rate": 9.62954559144528e-06, + "loss": 1.315, + "step": 1644 + }, + { + "epoch": 0.8360332888634775, + "grad_norm": 2.9403417110443115, + "learning_rate": 9.628910241887908e-06, + "loss": 1.3395, + "step": 1645 + }, + { + "epoch": 0.8365415157867988, + "grad_norm": 2.85813045501709, + "learning_rate": 9.628274368964178e-06, + "loss": 1.3317, + "step": 1646 + }, + { + "epoch": 0.8370497427101201, + "grad_norm": 2.6518867015838623, + "learning_rate": 9.627637972745986e-06, + "loss": 1.1876, + "step": 1647 + }, + { + "epoch": 0.8375579696334413, + "grad_norm": 2.998403549194336, + "learning_rate": 9.627001053305283e-06, + "loss": 1.274, + "step": 1648 + }, + { + "epoch": 0.8380661965567626, + "grad_norm": 2.8829715251922607, + "learning_rate": 9.626363610714084e-06, + "loss": 1.2354, + "step": 1649 + }, + { + "epoch": 0.8385744234800838, + "grad_norm": 2.7852256298065186, + "learning_rate": 9.62572564504446e-06, + "loss": 1.2655, + "step": 1650 + }, + { + "epoch": 0.8390826504034051, + "grad_norm": 2.878523349761963, + "learning_rate": 9.625087156368541e-06, + "loss": 1.2437, + "step": 1651 + }, + { + "epoch": 0.8395908773267264, + "grad_norm": 3.0157649517059326, + "learning_rate": 9.624448144758522e-06, + "loss": 1.2135, + "step": 1652 + }, + { + "epoch": 0.8400991042500476, + "grad_norm": 2.7613508701324463, + "learning_rate": 9.623808610286652e-06, + "loss": 1.26, + "step": 1653 + }, + { + "epoch": 0.8406073311733689, + "grad_norm": 2.9558663368225098, + "learning_rate": 9.623168553025235e-06, + "loss": 1.2329, + "step": 1654 + }, + { + "epoch": 0.8411155580966901, + "grad_norm": 2.719539165496826, + "learning_rate": 9.622527973046642e-06, + "loss": 1.1355, + "step": 1655 + }, + { + "epoch": 0.8416237850200115, + "grad_norm": 2.8478665351867676, + "learning_rate": 9.6218868704233e-06, + "loss": 1.309, + "step": 1656 + }, + { + "epoch": 0.8421320119433326, + "grad_norm": 2.840024948120117, + "learning_rate": 9.621245245227695e-06, + "loss": 1.1948, + "step": 1657 + }, + { + "epoch": 0.842640238866654, + "grad_norm": 2.674862861633301, + "learning_rate": 9.620603097532373e-06, + "loss": 1.2537, + "step": 1658 + }, + { + "epoch": 0.8431484657899753, + "grad_norm": 2.6723244190216064, + "learning_rate": 9.619960427409937e-06, + "loss": 1.2343, + "step": 1659 + }, + { + "epoch": 0.8436566927132965, + "grad_norm": 2.7692830562591553, + "learning_rate": 9.619317234933049e-06, + "loss": 1.2511, + "step": 1660 + }, + { + "epoch": 0.8441649196366178, + "grad_norm": 2.7434282302856445, + "learning_rate": 9.618673520174435e-06, + "loss": 1.2742, + "step": 1661 + }, + { + "epoch": 0.844673146559939, + "grad_norm": 2.9034934043884277, + "learning_rate": 9.618029283206873e-06, + "loss": 1.3008, + "step": 1662 + }, + { + "epoch": 0.8451813734832603, + "grad_norm": 2.9145328998565674, + "learning_rate": 9.617384524103207e-06, + "loss": 1.2975, + "step": 1663 + }, + { + "epoch": 0.8456896004065816, + "grad_norm": 2.774017810821533, + "learning_rate": 9.616739242936331e-06, + "loss": 1.1945, + "step": 1664 + }, + { + "epoch": 0.8461978273299028, + "grad_norm": 2.818248748779297, + "learning_rate": 9.61609343977921e-06, + "loss": 1.3295, + "step": 1665 + }, + { + "epoch": 0.8467060542532241, + "grad_norm": 3.614201307296753, + "learning_rate": 9.615447114704858e-06, + "loss": 1.2313, + "step": 1666 + }, + { + "epoch": 0.8472142811765453, + "grad_norm": 3.3795571327209473, + "learning_rate": 9.614800267786349e-06, + "loss": 1.248, + "step": 1667 + }, + { + "epoch": 0.8477225080998666, + "grad_norm": 3.0424909591674805, + "learning_rate": 9.614152899096824e-06, + "loss": 1.2607, + "step": 1668 + }, + { + "epoch": 0.8482307350231878, + "grad_norm": 2.789071798324585, + "learning_rate": 9.613505008709475e-06, + "loss": 1.1765, + "step": 1669 + }, + { + "epoch": 0.8487389619465091, + "grad_norm": 2.9772937297821045, + "learning_rate": 9.612856596697556e-06, + "loss": 1.2276, + "step": 1670 + }, + { + "epoch": 0.8492471888698304, + "grad_norm": 3.111518144607544, + "learning_rate": 9.612207663134376e-06, + "loss": 1.2703, + "step": 1671 + }, + { + "epoch": 0.8497554157931516, + "grad_norm": 3.206437110900879, + "learning_rate": 9.611558208093313e-06, + "loss": 1.265, + "step": 1672 + }, + { + "epoch": 0.8502636427164729, + "grad_norm": 3.0687997341156006, + "learning_rate": 9.610908231647794e-06, + "loss": 1.1979, + "step": 1673 + }, + { + "epoch": 0.8507718696397941, + "grad_norm": 2.947190761566162, + "learning_rate": 9.610257733871306e-06, + "loss": 1.2856, + "step": 1674 + }, + { + "epoch": 0.8512800965631154, + "grad_norm": 2.7396671772003174, + "learning_rate": 9.609606714837401e-06, + "loss": 1.1921, + "step": 1675 + }, + { + "epoch": 0.8517883234864367, + "grad_norm": 2.6573565006256104, + "learning_rate": 9.608955174619685e-06, + "loss": 1.1377, + "step": 1676 + }, + { + "epoch": 0.8522965504097579, + "grad_norm": 3.111696481704712, + "learning_rate": 9.608303113291825e-06, + "loss": 1.2351, + "step": 1677 + }, + { + "epoch": 0.8528047773330792, + "grad_norm": 2.96317458152771, + "learning_rate": 9.607650530927545e-06, + "loss": 1.3084, + "step": 1678 + }, + { + "epoch": 0.8533130042564004, + "grad_norm": 2.9022066593170166, + "learning_rate": 9.606997427600629e-06, + "loss": 1.2549, + "step": 1679 + }, + { + "epoch": 0.8538212311797218, + "grad_norm": 2.879927158355713, + "learning_rate": 9.60634380338492e-06, + "loss": 1.2083, + "step": 1680 + }, + { + "epoch": 0.8543294581030431, + "grad_norm": 2.751678705215454, + "learning_rate": 9.60568965835432e-06, + "loss": 1.2135, + "step": 1681 + }, + { + "epoch": 0.8548376850263643, + "grad_norm": 3.1005539894104004, + "learning_rate": 9.605034992582791e-06, + "loss": 1.3971, + "step": 1682 + }, + { + "epoch": 0.8553459119496856, + "grad_norm": 2.9313011169433594, + "learning_rate": 9.604379806144351e-06, + "loss": 1.2184, + "step": 1683 + }, + { + "epoch": 0.8558541388730068, + "grad_norm": 2.909487724304199, + "learning_rate": 9.603724099113078e-06, + "loss": 1.2142, + "step": 1684 + }, + { + "epoch": 0.8563623657963281, + "grad_norm": 2.8453476428985596, + "learning_rate": 9.603067871563112e-06, + "loss": 1.2028, + "step": 1685 + }, + { + "epoch": 0.8568705927196493, + "grad_norm": 2.707455635070801, + "learning_rate": 9.602411123568647e-06, + "loss": 1.2559, + "step": 1686 + }, + { + "epoch": 0.8573788196429706, + "grad_norm": 3.0561623573303223, + "learning_rate": 9.601753855203937e-06, + "loss": 1.2467, + "step": 1687 + }, + { + "epoch": 0.8578870465662919, + "grad_norm": 2.825486898422241, + "learning_rate": 9.601096066543299e-06, + "loss": 1.2824, + "step": 1688 + }, + { + "epoch": 0.8583952734896131, + "grad_norm": 3.058521032333374, + "learning_rate": 9.600437757661102e-06, + "loss": 1.2396, + "step": 1689 + }, + { + "epoch": 0.8589035004129344, + "grad_norm": 2.9022626876831055, + "learning_rate": 9.59977892863178e-06, + "loss": 1.2501, + "step": 1690 + }, + { + "epoch": 0.8594117273362556, + "grad_norm": 2.787989616394043, + "learning_rate": 9.599119579529823e-06, + "loss": 1.2036, + "step": 1691 + }, + { + "epoch": 0.8599199542595769, + "grad_norm": 3.1896774768829346, + "learning_rate": 9.598459710429781e-06, + "loss": 1.245, + "step": 1692 + }, + { + "epoch": 0.8604281811828982, + "grad_norm": 2.805469512939453, + "learning_rate": 9.597799321406261e-06, + "loss": 1.191, + "step": 1693 + }, + { + "epoch": 0.8609364081062194, + "grad_norm": 3.0362026691436768, + "learning_rate": 9.597138412533928e-06, + "loss": 1.2462, + "step": 1694 + }, + { + "epoch": 0.8614446350295407, + "grad_norm": 2.771352767944336, + "learning_rate": 9.596476983887508e-06, + "loss": 1.2599, + "step": 1695 + }, + { + "epoch": 0.8619528619528619, + "grad_norm": 2.9952127933502197, + "learning_rate": 9.595815035541789e-06, + "loss": 1.281, + "step": 1696 + }, + { + "epoch": 0.8624610888761832, + "grad_norm": 2.7725441455841064, + "learning_rate": 9.595152567571609e-06, + "loss": 1.2921, + "step": 1697 + }, + { + "epoch": 0.8629693157995045, + "grad_norm": 2.7685930728912354, + "learning_rate": 9.594489580051872e-06, + "loss": 1.3027, + "step": 1698 + }, + { + "epoch": 0.8634775427228257, + "grad_norm": 3.058549165725708, + "learning_rate": 9.593826073057538e-06, + "loss": 1.2497, + "step": 1699 + }, + { + "epoch": 0.863985769646147, + "grad_norm": 2.9856812953948975, + "learning_rate": 9.593162046663629e-06, + "loss": 1.3705, + "step": 1700 + }, + { + "epoch": 0.8644939965694682, + "grad_norm": 2.884981870651245, + "learning_rate": 9.592497500945218e-06, + "loss": 1.2894, + "step": 1701 + }, + { + "epoch": 0.8650022234927895, + "grad_norm": 2.938297986984253, + "learning_rate": 9.591832435977446e-06, + "loss": 1.2297, + "step": 1702 + }, + { + "epoch": 0.8655104504161107, + "grad_norm": 3.102844715118408, + "learning_rate": 9.591166851835505e-06, + "loss": 1.2453, + "step": 1703 + }, + { + "epoch": 0.866018677339432, + "grad_norm": 2.9945712089538574, + "learning_rate": 9.590500748594652e-06, + "loss": 1.3084, + "step": 1704 + }, + { + "epoch": 0.8665269042627534, + "grad_norm": 2.8621790409088135, + "learning_rate": 9.589834126330198e-06, + "loss": 1.2862, + "step": 1705 + }, + { + "epoch": 0.8670351311860746, + "grad_norm": 2.7755682468414307, + "learning_rate": 9.589166985117514e-06, + "loss": 1.3119, + "step": 1706 + }, + { + "epoch": 0.8675433581093959, + "grad_norm": 2.88777494430542, + "learning_rate": 9.588499325032031e-06, + "loss": 1.4133, + "step": 1707 + }, + { + "epoch": 0.8680515850327171, + "grad_norm": 2.8970770835876465, + "learning_rate": 9.58783114614924e-06, + "loss": 1.3324, + "step": 1708 + }, + { + "epoch": 0.8685598119560384, + "grad_norm": 5.2515716552734375, + "learning_rate": 9.587162448544684e-06, + "loss": 1.2924, + "step": 1709 + }, + { + "epoch": 0.8690680388793597, + "grad_norm": 2.7246832847595215, + "learning_rate": 9.586493232293973e-06, + "loss": 1.1798, + "step": 1710 + }, + { + "epoch": 0.8695762658026809, + "grad_norm": 2.7503769397735596, + "learning_rate": 9.585823497472769e-06, + "loss": 1.1128, + "step": 1711 + }, + { + "epoch": 0.8700844927260022, + "grad_norm": 2.8117806911468506, + "learning_rate": 9.585153244156795e-06, + "loss": 1.1741, + "step": 1712 + }, + { + "epoch": 0.8705927196493234, + "grad_norm": 2.8019652366638184, + "learning_rate": 9.584482472421837e-06, + "loss": 1.3051, + "step": 1713 + }, + { + "epoch": 0.8711009465726447, + "grad_norm": 3.00313138961792, + "learning_rate": 9.58381118234373e-06, + "loss": 1.2535, + "step": 1714 + }, + { + "epoch": 0.8716091734959659, + "grad_norm": 2.6497244834899902, + "learning_rate": 9.583139373998378e-06, + "loss": 1.2638, + "step": 1715 + }, + { + "epoch": 0.8721174004192872, + "grad_norm": 2.8147075176239014, + "learning_rate": 9.58246704746174e-06, + "loss": 1.193, + "step": 1716 + }, + { + "epoch": 0.8726256273426085, + "grad_norm": 2.795912265777588, + "learning_rate": 9.581794202809824e-06, + "loss": 1.2126, + "step": 1717 + }, + { + "epoch": 0.8731338542659297, + "grad_norm": 2.7988035678863525, + "learning_rate": 9.581120840118714e-06, + "loss": 1.1986, + "step": 1718 + }, + { + "epoch": 0.873642081189251, + "grad_norm": 2.717869758605957, + "learning_rate": 9.58044695946454e-06, + "loss": 1.2796, + "step": 1719 + }, + { + "epoch": 0.8741503081125722, + "grad_norm": 2.8445379734039307, + "learning_rate": 9.579772560923493e-06, + "loss": 1.0302, + "step": 1720 + }, + { + "epoch": 0.8746585350358935, + "grad_norm": 2.7780463695526123, + "learning_rate": 9.579097644571825e-06, + "loss": 1.3045, + "step": 1721 + }, + { + "epoch": 0.8751667619592148, + "grad_norm": 2.833652973175049, + "learning_rate": 9.578422210485844e-06, + "loss": 1.133, + "step": 1722 + }, + { + "epoch": 0.875674988882536, + "grad_norm": 2.707354784011841, + "learning_rate": 9.57774625874192e-06, + "loss": 1.2762, + "step": 1723 + }, + { + "epoch": 0.8761832158058573, + "grad_norm": 3.210391044616699, + "learning_rate": 9.577069789416477e-06, + "loss": 1.1706, + "step": 1724 + }, + { + "epoch": 0.8766914427291785, + "grad_norm": 2.731499671936035, + "learning_rate": 9.576392802586001e-06, + "loss": 1.245, + "step": 1725 + }, + { + "epoch": 0.8771996696524998, + "grad_norm": 2.9754645824432373, + "learning_rate": 9.575715298327037e-06, + "loss": 1.3256, + "step": 1726 + }, + { + "epoch": 0.8777078965758212, + "grad_norm": 2.9126806259155273, + "learning_rate": 9.575037276716184e-06, + "loss": 1.3404, + "step": 1727 + }, + { + "epoch": 0.8782161234991424, + "grad_norm": 3.192377805709839, + "learning_rate": 9.574358737830103e-06, + "loss": 1.2681, + "step": 1728 + }, + { + "epoch": 0.8787243504224637, + "grad_norm": 2.8953189849853516, + "learning_rate": 9.573679681745512e-06, + "loss": 1.2454, + "step": 1729 + }, + { + "epoch": 0.8792325773457849, + "grad_norm": 3.191070795059204, + "learning_rate": 9.57300010853919e-06, + "loss": 1.269, + "step": 1730 + }, + { + "epoch": 0.8797408042691062, + "grad_norm": 3.6386911869049072, + "learning_rate": 9.572320018287973e-06, + "loss": 1.2563, + "step": 1731 + }, + { + "epoch": 0.8802490311924274, + "grad_norm": 2.961223602294922, + "learning_rate": 9.571639411068754e-06, + "loss": 1.2032, + "step": 1732 + }, + { + "epoch": 0.8807572581157487, + "grad_norm": 2.9369919300079346, + "learning_rate": 9.570958286958485e-06, + "loss": 1.2041, + "step": 1733 + }, + { + "epoch": 0.88126548503907, + "grad_norm": 2.8557302951812744, + "learning_rate": 9.570276646034178e-06, + "loss": 1.1812, + "step": 1734 + }, + { + "epoch": 0.8817737119623912, + "grad_norm": 2.7387492656707764, + "learning_rate": 9.569594488372903e-06, + "loss": 1.2181, + "step": 1735 + }, + { + "epoch": 0.8822819388857125, + "grad_norm": 2.7892708778381348, + "learning_rate": 9.568911814051787e-06, + "loss": 1.1526, + "step": 1736 + }, + { + "epoch": 0.8827901658090337, + "grad_norm": 2.80728816986084, + "learning_rate": 9.568228623148018e-06, + "loss": 1.2098, + "step": 1737 + }, + { + "epoch": 0.883298392732355, + "grad_norm": 2.7470126152038574, + "learning_rate": 9.567544915738839e-06, + "loss": 1.2536, + "step": 1738 + }, + { + "epoch": 0.8838066196556763, + "grad_norm": 2.956306219100952, + "learning_rate": 9.566860691901554e-06, + "loss": 1.2589, + "step": 1739 + }, + { + "epoch": 0.8843148465789975, + "grad_norm": 2.9518215656280518, + "learning_rate": 9.566175951713524e-06, + "loss": 1.2662, + "step": 1740 + }, + { + "epoch": 0.8848230735023188, + "grad_norm": 2.8271007537841797, + "learning_rate": 9.565490695252171e-06, + "loss": 1.2346, + "step": 1741 + }, + { + "epoch": 0.88533130042564, + "grad_norm": 2.9564075469970703, + "learning_rate": 9.56480492259497e-06, + "loss": 1.2713, + "step": 1742 + }, + { + "epoch": 0.8858395273489613, + "grad_norm": 2.854062795639038, + "learning_rate": 9.564118633819458e-06, + "loss": 1.2513, + "step": 1743 + }, + { + "epoch": 0.8863477542722826, + "grad_norm": 2.643578290939331, + "learning_rate": 9.563431829003233e-06, + "loss": 1.2893, + "step": 1744 + }, + { + "epoch": 0.8868559811956038, + "grad_norm": 2.767890691757202, + "learning_rate": 9.562744508223947e-06, + "loss": 1.32, + "step": 1745 + }, + { + "epoch": 0.8873642081189251, + "grad_norm": 2.9053843021392822, + "learning_rate": 9.562056671559312e-06, + "loss": 1.2899, + "step": 1746 + }, + { + "epoch": 0.8878724350422463, + "grad_norm": 2.75801682472229, + "learning_rate": 9.561368319087097e-06, + "loss": 1.2051, + "step": 1747 + }, + { + "epoch": 0.8883806619655676, + "grad_norm": 2.966491460800171, + "learning_rate": 9.56067945088513e-06, + "loss": 1.3499, + "step": 1748 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 2.8148977756500244, + "learning_rate": 9.5599900670313e-06, + "loss": 1.213, + "step": 1749 + }, + { + "epoch": 0.8893971158122101, + "grad_norm": 2.659385919570923, + "learning_rate": 9.55930016760355e-06, + "loss": 1.1845, + "step": 1750 + }, + { + "epoch": 0.8899053427355315, + "grad_norm": 2.595902919769287, + "learning_rate": 9.558609752679884e-06, + "loss": 1.1405, + "step": 1751 + }, + { + "epoch": 0.8904135696588527, + "grad_norm": 2.6552109718322754, + "learning_rate": 9.557918822338362e-06, + "loss": 1.189, + "step": 1752 + }, + { + "epoch": 0.890921796582174, + "grad_norm": 2.9055867195129395, + "learning_rate": 9.557227376657106e-06, + "loss": 1.0663, + "step": 1753 + }, + { + "epoch": 0.8914300235054952, + "grad_norm": 3.767561435699463, + "learning_rate": 9.556535415714294e-06, + "loss": 1.3009, + "step": 1754 + }, + { + "epoch": 0.8919382504288165, + "grad_norm": 2.8781096935272217, + "learning_rate": 9.555842939588162e-06, + "loss": 1.177, + "step": 1755 + }, + { + "epoch": 0.8924464773521378, + "grad_norm": 2.7181549072265625, + "learning_rate": 9.555149948357004e-06, + "loss": 1.045, + "step": 1756 + }, + { + "epoch": 0.892954704275459, + "grad_norm": 2.6964972019195557, + "learning_rate": 9.554456442099171e-06, + "loss": 1.1419, + "step": 1757 + }, + { + "epoch": 0.8934629311987803, + "grad_norm": 2.87219500541687, + "learning_rate": 9.553762420893078e-06, + "loss": 1.2508, + "step": 1758 + }, + { + "epoch": 0.8939711581221015, + "grad_norm": 2.856064558029175, + "learning_rate": 9.553067884817193e-06, + "loss": 1.189, + "step": 1759 + }, + { + "epoch": 0.8944793850454228, + "grad_norm": 2.5938351154327393, + "learning_rate": 9.552372833950041e-06, + "loss": 1.2577, + "step": 1760 + }, + { + "epoch": 0.894987611968744, + "grad_norm": 2.557764768600464, + "learning_rate": 9.551677268370212e-06, + "loss": 1.1727, + "step": 1761 + }, + { + "epoch": 0.8954958388920653, + "grad_norm": 2.9965009689331055, + "learning_rate": 9.550981188156347e-06, + "loss": 1.2943, + "step": 1762 + }, + { + "epoch": 0.8960040658153866, + "grad_norm": 2.8568296432495117, + "learning_rate": 9.550284593387148e-06, + "loss": 1.1781, + "step": 1763 + }, + { + "epoch": 0.8965122927387078, + "grad_norm": 2.8139688968658447, + "learning_rate": 9.549587484141377e-06, + "loss": 1.2641, + "step": 1764 + }, + { + "epoch": 0.8970205196620291, + "grad_norm": 3.023052930831909, + "learning_rate": 9.54888986049785e-06, + "loss": 1.2337, + "step": 1765 + }, + { + "epoch": 0.8975287465853503, + "grad_norm": 2.8153154850006104, + "learning_rate": 9.548191722535447e-06, + "loss": 1.2938, + "step": 1766 + }, + { + "epoch": 0.8980369735086716, + "grad_norm": 3.049635887145996, + "learning_rate": 9.5474930703331e-06, + "loss": 1.3754, + "step": 1767 + }, + { + "epoch": 0.8985452004319929, + "grad_norm": 2.8150997161865234, + "learning_rate": 9.546793903969801e-06, + "loss": 1.1264, + "step": 1768 + }, + { + "epoch": 0.8990534273553141, + "grad_norm": 2.751206159591675, + "learning_rate": 9.546094223524605e-06, + "loss": 1.2231, + "step": 1769 + }, + { + "epoch": 0.8995616542786354, + "grad_norm": 3.0150442123413086, + "learning_rate": 9.545394029076619e-06, + "loss": 1.2937, + "step": 1770 + }, + { + "epoch": 0.9000698812019566, + "grad_norm": 2.9299299716949463, + "learning_rate": 9.54469332070501e-06, + "loss": 1.3397, + "step": 1771 + }, + { + "epoch": 0.9005781081252779, + "grad_norm": 3.0025529861450195, + "learning_rate": 9.543992098489003e-06, + "loss": 1.2489, + "step": 1772 + }, + { + "epoch": 0.9010863350485993, + "grad_norm": 2.807588815689087, + "learning_rate": 9.543290362507882e-06, + "loss": 1.2776, + "step": 1773 + }, + { + "epoch": 0.9015945619719204, + "grad_norm": 2.946342706680298, + "learning_rate": 9.542588112840989e-06, + "loss": 1.2245, + "step": 1774 + }, + { + "epoch": 0.9021027888952418, + "grad_norm": 2.9518632888793945, + "learning_rate": 9.541885349567724e-06, + "loss": 1.3245, + "step": 1775 + }, + { + "epoch": 0.902611015818563, + "grad_norm": 2.85158109664917, + "learning_rate": 9.541182072767544e-06, + "loss": 1.1866, + "step": 1776 + }, + { + "epoch": 0.9031192427418843, + "grad_norm": 2.706902503967285, + "learning_rate": 9.540478282519963e-06, + "loss": 1.258, + "step": 1777 + }, + { + "epoch": 0.9036274696652055, + "grad_norm": 2.9308853149414062, + "learning_rate": 9.539773978904558e-06, + "loss": 1.3477, + "step": 1778 + }, + { + "epoch": 0.9041356965885268, + "grad_norm": 2.65582275390625, + "learning_rate": 9.53906916200096e-06, + "loss": 1.1828, + "step": 1779 + }, + { + "epoch": 0.9046439235118481, + "grad_norm": 2.792782783508301, + "learning_rate": 9.538363831888858e-06, + "loss": 1.2049, + "step": 1780 + }, + { + "epoch": 0.9051521504351693, + "grad_norm": 2.8841593265533447, + "learning_rate": 9.537657988647999e-06, + "loss": 1.2875, + "step": 1781 + }, + { + "epoch": 0.9056603773584906, + "grad_norm": 2.751776695251465, + "learning_rate": 9.536951632358193e-06, + "loss": 1.1579, + "step": 1782 + }, + { + "epoch": 0.9061686042818118, + "grad_norm": 2.696763753890991, + "learning_rate": 9.5362447630993e-06, + "loss": 1.186, + "step": 1783 + }, + { + "epoch": 0.9066768312051331, + "grad_norm": 2.878833293914795, + "learning_rate": 9.535537380951242e-06, + "loss": 1.1926, + "step": 1784 + }, + { + "epoch": 0.9071850581284544, + "grad_norm": 2.6030893325805664, + "learning_rate": 9.534829485994002e-06, + "loss": 1.1238, + "step": 1785 + }, + { + "epoch": 0.9076932850517756, + "grad_norm": 2.6879279613494873, + "learning_rate": 9.534121078307615e-06, + "loss": 1.1932, + "step": 1786 + }, + { + "epoch": 0.9082015119750969, + "grad_norm": 2.800438404083252, + "learning_rate": 9.533412157972179e-06, + "loss": 1.2328, + "step": 1787 + }, + { + "epoch": 0.9087097388984181, + "grad_norm": 2.800389289855957, + "learning_rate": 9.532702725067846e-06, + "loss": 1.2804, + "step": 1788 + }, + { + "epoch": 0.9092179658217394, + "grad_norm": 2.87565016746521, + "learning_rate": 9.531992779674828e-06, + "loss": 1.1231, + "step": 1789 + }, + { + "epoch": 0.9097261927450607, + "grad_norm": 2.781198501586914, + "learning_rate": 9.531282321873398e-06, + "loss": 1.1642, + "step": 1790 + }, + { + "epoch": 0.9102344196683819, + "grad_norm": 3.292746067047119, + "learning_rate": 9.530571351743881e-06, + "loss": 1.1705, + "step": 1791 + }, + { + "epoch": 0.9107426465917032, + "grad_norm": 2.8538334369659424, + "learning_rate": 9.52985986936666e-06, + "loss": 1.1693, + "step": 1792 + }, + { + "epoch": 0.9112508735150244, + "grad_norm": 2.933720588684082, + "learning_rate": 9.529147874822184e-06, + "loss": 1.1758, + "step": 1793 + }, + { + "epoch": 0.9117591004383457, + "grad_norm": 3.115551710128784, + "learning_rate": 9.528435368190952e-06, + "loss": 1.2691, + "step": 1794 + }, + { + "epoch": 0.9122673273616669, + "grad_norm": 2.8642966747283936, + "learning_rate": 9.527722349553522e-06, + "loss": 1.1481, + "step": 1795 + }, + { + "epoch": 0.9127755542849882, + "grad_norm": 3.1207451820373535, + "learning_rate": 9.527008818990513e-06, + "loss": 1.3712, + "step": 1796 + }, + { + "epoch": 0.9132837812083096, + "grad_norm": 2.7371482849121094, + "learning_rate": 9.526294776582599e-06, + "loss": 1.2768, + "step": 1797 + }, + { + "epoch": 0.9137920081316308, + "grad_norm": 3.4604902267456055, + "learning_rate": 9.525580222410512e-06, + "loss": 1.3342, + "step": 1798 + }, + { + "epoch": 0.9143002350549521, + "grad_norm": 2.8706648349761963, + "learning_rate": 9.524865156555047e-06, + "loss": 1.2667, + "step": 1799 + }, + { + "epoch": 0.9148084619782733, + "grad_norm": 2.873488426208496, + "learning_rate": 9.52414957909705e-06, + "loss": 1.2392, + "step": 1800 + }, + { + "epoch": 0.9153166889015946, + "grad_norm": 2.964588165283203, + "learning_rate": 9.523433490117427e-06, + "loss": 1.3241, + "step": 1801 + }, + { + "epoch": 0.9158249158249159, + "grad_norm": 2.9600985050201416, + "learning_rate": 9.522716889697141e-06, + "loss": 1.3308, + "step": 1802 + }, + { + "epoch": 0.9163331427482371, + "grad_norm": 2.5625863075256348, + "learning_rate": 9.521999777917219e-06, + "loss": 1.1425, + "step": 1803 + }, + { + "epoch": 0.9168413696715584, + "grad_norm": 2.7706921100616455, + "learning_rate": 9.521282154858736e-06, + "loss": 1.3258, + "step": 1804 + }, + { + "epoch": 0.9173495965948796, + "grad_norm": 2.833293914794922, + "learning_rate": 9.520564020602834e-06, + "loss": 1.2726, + "step": 1805 + }, + { + "epoch": 0.9178578235182009, + "grad_norm": 2.7428948879241943, + "learning_rate": 9.519845375230706e-06, + "loss": 1.2617, + "step": 1806 + }, + { + "epoch": 0.9183660504415221, + "grad_norm": 2.8612327575683594, + "learning_rate": 9.519126218823607e-06, + "loss": 1.178, + "step": 1807 + }, + { + "epoch": 0.9188742773648434, + "grad_norm": 2.9736928939819336, + "learning_rate": 9.518406551462847e-06, + "loss": 1.279, + "step": 1808 + }, + { + "epoch": 0.9193825042881647, + "grad_norm": 3.0132932662963867, + "learning_rate": 9.517686373229795e-06, + "loss": 1.2099, + "step": 1809 + }, + { + "epoch": 0.9198907312114859, + "grad_norm": 2.5593981742858887, + "learning_rate": 9.516965684205877e-06, + "loss": 1.1039, + "step": 1810 + }, + { + "epoch": 0.9203989581348072, + "grad_norm": 2.7686641216278076, + "learning_rate": 9.51624448447258e-06, + "loss": 1.1157, + "step": 1811 + }, + { + "epoch": 0.9209071850581284, + "grad_norm": 2.81060528755188, + "learning_rate": 9.515522774111445e-06, + "loss": 1.1971, + "step": 1812 + }, + { + "epoch": 0.9214154119814497, + "grad_norm": 2.5526318550109863, + "learning_rate": 9.514800553204071e-06, + "loss": 1.1534, + "step": 1813 + }, + { + "epoch": 0.921923638904771, + "grad_norm": 2.841200590133667, + "learning_rate": 9.514077821832118e-06, + "loss": 1.2518, + "step": 1814 + }, + { + "epoch": 0.9224318658280922, + "grad_norm": 2.7869009971618652, + "learning_rate": 9.513354580077299e-06, + "loss": 1.2512, + "step": 1815 + }, + { + "epoch": 0.9229400927514135, + "grad_norm": 2.617814302444458, + "learning_rate": 9.512630828021387e-06, + "loss": 1.1089, + "step": 1816 + }, + { + "epoch": 0.9234483196747347, + "grad_norm": 2.8492302894592285, + "learning_rate": 9.511906565746214e-06, + "loss": 1.1446, + "step": 1817 + }, + { + "epoch": 0.923956546598056, + "grad_norm": 2.7213473320007324, + "learning_rate": 9.51118179333367e-06, + "loss": 1.1777, + "step": 1818 + }, + { + "epoch": 0.9244647735213773, + "grad_norm": 3.0611300468444824, + "learning_rate": 9.510456510865697e-06, + "loss": 1.1902, + "step": 1819 + }, + { + "epoch": 0.9249730004446985, + "grad_norm": 2.8940231800079346, + "learning_rate": 9.509730718424303e-06, + "loss": 1.2389, + "step": 1820 + }, + { + "epoch": 0.9254812273680199, + "grad_norm": 3.2034969329833984, + "learning_rate": 9.509004416091548e-06, + "loss": 1.3084, + "step": 1821 + }, + { + "epoch": 0.925989454291341, + "grad_norm": 2.7354447841644287, + "learning_rate": 9.50827760394955e-06, + "loss": 1.1467, + "step": 1822 + }, + { + "epoch": 0.9264976812146624, + "grad_norm": 4.729049205780029, + "learning_rate": 9.507550282080488e-06, + "loss": 1.2631, + "step": 1823 + }, + { + "epoch": 0.9270059081379836, + "grad_norm": 3.0362253189086914, + "learning_rate": 9.506822450566595e-06, + "loss": 1.2361, + "step": 1824 + }, + { + "epoch": 0.9275141350613049, + "grad_norm": 3.075381278991699, + "learning_rate": 9.506094109490161e-06, + "loss": 1.2362, + "step": 1825 + }, + { + "epoch": 0.9280223619846262, + "grad_norm": 2.9710774421691895, + "learning_rate": 9.505365258933542e-06, + "loss": 1.3233, + "step": 1826 + }, + { + "epoch": 0.9285305889079474, + "grad_norm": 2.99249529838562, + "learning_rate": 9.504635898979138e-06, + "loss": 1.1723, + "step": 1827 + }, + { + "epoch": 0.9290388158312687, + "grad_norm": 2.88806414604187, + "learning_rate": 9.503906029709418e-06, + "loss": 1.2333, + "step": 1828 + }, + { + "epoch": 0.9295470427545899, + "grad_norm": 2.997180938720703, + "learning_rate": 9.503175651206903e-06, + "loss": 1.3472, + "step": 1829 + }, + { + "epoch": 0.9300552696779112, + "grad_norm": 2.8601789474487305, + "learning_rate": 9.502444763554174e-06, + "loss": 1.2205, + "step": 1830 + }, + { + "epoch": 0.9305634966012325, + "grad_norm": 3.0461935997009277, + "learning_rate": 9.501713366833869e-06, + "loss": 1.16, + "step": 1831 + }, + { + "epoch": 0.9310717235245537, + "grad_norm": 2.8133318424224854, + "learning_rate": 9.500981461128681e-06, + "loss": 1.2924, + "step": 1832 + }, + { + "epoch": 0.931579950447875, + "grad_norm": 2.750631809234619, + "learning_rate": 9.500249046521365e-06, + "loss": 1.2311, + "step": 1833 + }, + { + "epoch": 0.9320881773711962, + "grad_norm": 3.502110004425049, + "learning_rate": 9.49951612309473e-06, + "loss": 1.3335, + "step": 1834 + }, + { + "epoch": 0.9325964042945175, + "grad_norm": 2.9846878051757812, + "learning_rate": 9.498782690931643e-06, + "loss": 1.2773, + "step": 1835 + }, + { + "epoch": 0.9331046312178388, + "grad_norm": 2.80678653717041, + "learning_rate": 9.498048750115032e-06, + "loss": 1.1365, + "step": 1836 + }, + { + "epoch": 0.93361285814116, + "grad_norm": 3.084103584289551, + "learning_rate": 9.497314300727877e-06, + "loss": 1.297, + "step": 1837 + }, + { + "epoch": 0.9341210850644813, + "grad_norm": 2.8763110637664795, + "learning_rate": 9.49657934285322e-06, + "loss": 1.3062, + "step": 1838 + }, + { + "epoch": 0.9346293119878025, + "grad_norm": 2.8453195095062256, + "learning_rate": 9.495843876574157e-06, + "loss": 1.2479, + "step": 1839 + }, + { + "epoch": 0.9351375389111238, + "grad_norm": 2.914537191390991, + "learning_rate": 9.495107901973846e-06, + "loss": 1.2901, + "step": 1840 + }, + { + "epoch": 0.935645765834445, + "grad_norm": 2.7122802734375, + "learning_rate": 9.494371419135498e-06, + "loss": 1.1318, + "step": 1841 + }, + { + "epoch": 0.9361539927577663, + "grad_norm": 2.932257890701294, + "learning_rate": 9.493634428142383e-06, + "loss": 1.3514, + "step": 1842 + }, + { + "epoch": 0.9366622196810876, + "grad_norm": 2.784000873565674, + "learning_rate": 9.492896929077828e-06, + "loss": 1.2715, + "step": 1843 + }, + { + "epoch": 0.9371704466044088, + "grad_norm": 2.914268732070923, + "learning_rate": 9.492158922025221e-06, + "loss": 1.1562, + "step": 1844 + }, + { + "epoch": 0.9376786735277302, + "grad_norm": 2.8161864280700684, + "learning_rate": 9.491420407068002e-06, + "loss": 1.1786, + "step": 1845 + }, + { + "epoch": 0.9381869004510514, + "grad_norm": 2.703287363052368, + "learning_rate": 9.49068138428967e-06, + "loss": 1.1797, + "step": 1846 + }, + { + "epoch": 0.9386951273743727, + "grad_norm": 2.7507104873657227, + "learning_rate": 9.489941853773787e-06, + "loss": 1.2552, + "step": 1847 + }, + { + "epoch": 0.939203354297694, + "grad_norm": 3.103407859802246, + "learning_rate": 9.489201815603964e-06, + "loss": 1.2224, + "step": 1848 + }, + { + "epoch": 0.9397115812210152, + "grad_norm": 2.6951043605804443, + "learning_rate": 9.488461269863873e-06, + "loss": 1.3135, + "step": 1849 + }, + { + "epoch": 0.9402198081443365, + "grad_norm": 2.7768237590789795, + "learning_rate": 9.487720216637247e-06, + "loss": 1.0811, + "step": 1850 + }, + { + "epoch": 0.9407280350676577, + "grad_norm": 2.717684030532837, + "learning_rate": 9.486978656007869e-06, + "loss": 1.1631, + "step": 1851 + }, + { + "epoch": 0.941236261990979, + "grad_norm": 3.163203001022339, + "learning_rate": 9.486236588059585e-06, + "loss": 1.2808, + "step": 1852 + }, + { + "epoch": 0.9417444889143002, + "grad_norm": 2.7564680576324463, + "learning_rate": 9.485494012876298e-06, + "loss": 1.2187, + "step": 1853 + }, + { + "epoch": 0.9422527158376215, + "grad_norm": 2.8404791355133057, + "learning_rate": 9.484750930541964e-06, + "loss": 1.3074, + "step": 1854 + }, + { + "epoch": 0.9427609427609428, + "grad_norm": 2.8263309001922607, + "learning_rate": 9.484007341140602e-06, + "loss": 1.2831, + "step": 1855 + }, + { + "epoch": 0.943269169684264, + "grad_norm": 2.9559013843536377, + "learning_rate": 9.483263244756284e-06, + "loss": 1.162, + "step": 1856 + }, + { + "epoch": 0.9437773966075853, + "grad_norm": 2.8240835666656494, + "learning_rate": 9.482518641473144e-06, + "loss": 1.2336, + "step": 1857 + }, + { + "epoch": 0.9442856235309065, + "grad_norm": 2.7373499870300293, + "learning_rate": 9.481773531375366e-06, + "loss": 1.293, + "step": 1858 + }, + { + "epoch": 0.9447938504542278, + "grad_norm": 2.891880512237549, + "learning_rate": 9.481027914547199e-06, + "loss": 1.2538, + "step": 1859 + }, + { + "epoch": 0.9453020773775491, + "grad_norm": 2.8699028491973877, + "learning_rate": 9.480281791072944e-06, + "loss": 1.1302, + "step": 1860 + }, + { + "epoch": 0.9458103043008703, + "grad_norm": 3.3577420711517334, + "learning_rate": 9.479535161036962e-06, + "loss": 1.2419, + "step": 1861 + }, + { + "epoch": 0.9463185312241916, + "grad_norm": 3.0245659351348877, + "learning_rate": 9.478788024523673e-06, + "loss": 1.33, + "step": 1862 + }, + { + "epoch": 0.9468267581475128, + "grad_norm": 2.950090169906616, + "learning_rate": 9.478040381617546e-06, + "loss": 1.213, + "step": 1863 + }, + { + "epoch": 0.9473349850708341, + "grad_norm": 2.874415397644043, + "learning_rate": 9.477292232403118e-06, + "loss": 1.1361, + "step": 1864 + }, + { + "epoch": 0.9478432119941554, + "grad_norm": 3.1284801959991455, + "learning_rate": 9.476543576964977e-06, + "loss": 1.3103, + "step": 1865 + }, + { + "epoch": 0.9483514389174766, + "grad_norm": 2.839769124984741, + "learning_rate": 9.475794415387766e-06, + "loss": 1.2267, + "step": 1866 + }, + { + "epoch": 0.948859665840798, + "grad_norm": 2.890130043029785, + "learning_rate": 9.475044747756195e-06, + "loss": 1.158, + "step": 1867 + }, + { + "epoch": 0.9493678927641191, + "grad_norm": 2.8990070819854736, + "learning_rate": 9.474294574155022e-06, + "loss": 1.2617, + "step": 1868 + }, + { + "epoch": 0.9498761196874405, + "grad_norm": 2.893882989883423, + "learning_rate": 9.473543894669063e-06, + "loss": 1.2091, + "step": 1869 + }, + { + "epoch": 0.9503843466107617, + "grad_norm": 2.8073065280914307, + "learning_rate": 9.472792709383197e-06, + "loss": 1.2089, + "step": 1870 + }, + { + "epoch": 0.950892573534083, + "grad_norm": 2.6496944427490234, + "learning_rate": 9.472041018382354e-06, + "loss": 1.1846, + "step": 1871 + }, + { + "epoch": 0.9514008004574043, + "grad_norm": 2.8289594650268555, + "learning_rate": 9.471288821751525e-06, + "loss": 1.2576, + "step": 1872 + }, + { + "epoch": 0.9519090273807255, + "grad_norm": 2.997814893722534, + "learning_rate": 9.470536119575757e-06, + "loss": 1.2837, + "step": 1873 + }, + { + "epoch": 0.9524172543040468, + "grad_norm": 2.66351318359375, + "learning_rate": 9.469782911940151e-06, + "loss": 1.2383, + "step": 1874 + }, + { + "epoch": 0.952925481227368, + "grad_norm": 2.7139089107513428, + "learning_rate": 9.469029198929873e-06, + "loss": 1.1613, + "step": 1875 + }, + { + "epoch": 0.9534337081506893, + "grad_norm": 2.67689847946167, + "learning_rate": 9.468274980630137e-06, + "loss": 1.2042, + "step": 1876 + }, + { + "epoch": 0.9539419350740106, + "grad_norm": 2.7813730239868164, + "learning_rate": 9.467520257126223e-06, + "loss": 1.2591, + "step": 1877 + }, + { + "epoch": 0.9544501619973318, + "grad_norm": 2.801579713821411, + "learning_rate": 9.46676502850346e-06, + "loss": 1.1587, + "step": 1878 + }, + { + "epoch": 0.9549583889206531, + "grad_norm": 2.7422478199005127, + "learning_rate": 9.466009294847238e-06, + "loss": 1.2799, + "step": 1879 + }, + { + "epoch": 0.9554666158439743, + "grad_norm": 2.8934004306793213, + "learning_rate": 9.465253056243005e-06, + "loss": 1.254, + "step": 1880 + }, + { + "epoch": 0.9559748427672956, + "grad_norm": 2.6929843425750732, + "learning_rate": 9.464496312776265e-06, + "loss": 1.0316, + "step": 1881 + }, + { + "epoch": 0.9564830696906168, + "grad_norm": 2.816587209701538, + "learning_rate": 9.463739064532578e-06, + "loss": 1.253, + "step": 1882 + }, + { + "epoch": 0.9569912966139381, + "grad_norm": 2.6673052310943604, + "learning_rate": 9.462981311597563e-06, + "loss": 1.2072, + "step": 1883 + }, + { + "epoch": 0.9574995235372594, + "grad_norm": 2.825695514678955, + "learning_rate": 9.462223054056894e-06, + "loss": 1.2092, + "step": 1884 + }, + { + "epoch": 0.9580077504605806, + "grad_norm": 3.181696653366089, + "learning_rate": 9.461464291996305e-06, + "loss": 1.2547, + "step": 1885 + }, + { + "epoch": 0.9585159773839019, + "grad_norm": 2.9147400856018066, + "learning_rate": 9.460705025501581e-06, + "loss": 1.2261, + "step": 1886 + }, + { + "epoch": 0.9590242043072231, + "grad_norm": 6.87190580368042, + "learning_rate": 9.459945254658574e-06, + "loss": 1.3751, + "step": 1887 + }, + { + "epoch": 0.9595324312305444, + "grad_norm": 2.883603096008301, + "learning_rate": 9.459184979553183e-06, + "loss": 1.3314, + "step": 1888 + }, + { + "epoch": 0.9600406581538657, + "grad_norm": 2.7869436740875244, + "learning_rate": 9.45842420027137e-06, + "loss": 1.1628, + "step": 1889 + }, + { + "epoch": 0.9605488850771869, + "grad_norm": 2.8722105026245117, + "learning_rate": 9.457662916899152e-06, + "loss": 1.2581, + "step": 1890 + }, + { + "epoch": 0.9610571120005083, + "grad_norm": 2.908513069152832, + "learning_rate": 9.456901129522605e-06, + "loss": 1.2924, + "step": 1891 + }, + { + "epoch": 0.9615653389238294, + "grad_norm": 2.925353765487671, + "learning_rate": 9.456138838227857e-06, + "loss": 1.2244, + "step": 1892 + }, + { + "epoch": 0.9620735658471508, + "grad_norm": 2.8243985176086426, + "learning_rate": 9.455376043101099e-06, + "loss": 1.2406, + "step": 1893 + }, + { + "epoch": 0.9625817927704721, + "grad_norm": 2.6665141582489014, + "learning_rate": 9.454612744228572e-06, + "loss": 1.1531, + "step": 1894 + }, + { + "epoch": 0.9630900196937933, + "grad_norm": 2.6943883895874023, + "learning_rate": 9.453848941696586e-06, + "loss": 1.313, + "step": 1895 + }, + { + "epoch": 0.9635982466171146, + "grad_norm": 2.8478856086730957, + "learning_rate": 9.453084635591491e-06, + "loss": 1.2133, + "step": 1896 + }, + { + "epoch": 0.9641064735404358, + "grad_norm": 2.70573091506958, + "learning_rate": 9.45231982599971e-06, + "loss": 1.1661, + "step": 1897 + }, + { + "epoch": 0.9646147004637571, + "grad_norm": 2.684609889984131, + "learning_rate": 9.451554513007712e-06, + "loss": 1.3076, + "step": 1898 + }, + { + "epoch": 0.9651229273870783, + "grad_norm": 2.785606861114502, + "learning_rate": 9.450788696702028e-06, + "loss": 1.0978, + "step": 1899 + }, + { + "epoch": 0.9656311543103996, + "grad_norm": 2.884321689605713, + "learning_rate": 9.450022377169246e-06, + "loss": 1.2179, + "step": 1900 + }, + { + "epoch": 0.9661393812337209, + "grad_norm": 2.9700825214385986, + "learning_rate": 9.449255554496007e-06, + "loss": 1.1781, + "step": 1901 + }, + { + "epoch": 0.9666476081570421, + "grad_norm": 3.0699474811553955, + "learning_rate": 9.448488228769015e-06, + "loss": 1.3785, + "step": 1902 + }, + { + "epoch": 0.9671558350803634, + "grad_norm": 2.7597365379333496, + "learning_rate": 9.447720400075024e-06, + "loss": 1.1666, + "step": 1903 + }, + { + "epoch": 0.9676640620036846, + "grad_norm": 2.7310798168182373, + "learning_rate": 9.446952068500852e-06, + "loss": 1.2326, + "step": 1904 + }, + { + "epoch": 0.9681722889270059, + "grad_norm": 2.821917772293091, + "learning_rate": 9.446183234133367e-06, + "loss": 1.2468, + "step": 1905 + }, + { + "epoch": 0.9686805158503272, + "grad_norm": 2.7148962020874023, + "learning_rate": 9.445413897059499e-06, + "loss": 1.273, + "step": 1906 + }, + { + "epoch": 0.9691887427736484, + "grad_norm": 3.648280143737793, + "learning_rate": 9.44464405736623e-06, + "loss": 1.2404, + "step": 1907 + }, + { + "epoch": 0.9696969696969697, + "grad_norm": 2.7357401847839355, + "learning_rate": 9.443873715140606e-06, + "loss": 1.1583, + "step": 1908 + }, + { + "epoch": 0.9702051966202909, + "grad_norm": 2.8272571563720703, + "learning_rate": 9.443102870469722e-06, + "loss": 1.224, + "step": 1909 + }, + { + "epoch": 0.9707134235436122, + "grad_norm": 3.024099826812744, + "learning_rate": 9.442331523440736e-06, + "loss": 1.2522, + "step": 1910 + }, + { + "epoch": 0.9712216504669335, + "grad_norm": 2.9257144927978516, + "learning_rate": 9.441559674140859e-06, + "loss": 1.2456, + "step": 1911 + }, + { + "epoch": 0.9717298773902547, + "grad_norm": 3.5403923988342285, + "learning_rate": 9.440787322657358e-06, + "loss": 1.3027, + "step": 1912 + }, + { + "epoch": 0.972238104313576, + "grad_norm": 3.196686267852783, + "learning_rate": 9.44001446907756e-06, + "loss": 1.2347, + "step": 1913 + }, + { + "epoch": 0.9727463312368972, + "grad_norm": 2.7601497173309326, + "learning_rate": 9.439241113488849e-06, + "loss": 1.2686, + "step": 1914 + }, + { + "epoch": 0.9732545581602186, + "grad_norm": 3.1352243423461914, + "learning_rate": 9.438467255978663e-06, + "loss": 1.2042, + "step": 1915 + }, + { + "epoch": 0.9737627850835398, + "grad_norm": 2.772083044052124, + "learning_rate": 9.437692896634498e-06, + "loss": 1.2699, + "step": 1916 + }, + { + "epoch": 0.9742710120068611, + "grad_norm": 3.0568454265594482, + "learning_rate": 9.436918035543907e-06, + "loss": 1.391, + "step": 1917 + }, + { + "epoch": 0.9747792389301824, + "grad_norm": 2.8727424144744873, + "learning_rate": 9.4361426727945e-06, + "loss": 1.2516, + "step": 1918 + }, + { + "epoch": 0.9752874658535036, + "grad_norm": 2.9823689460754395, + "learning_rate": 9.43536680847394e-06, + "loss": 1.2432, + "step": 1919 + }, + { + "epoch": 0.9757956927768249, + "grad_norm": 2.8589422702789307, + "learning_rate": 9.434590442669952e-06, + "loss": 1.2263, + "step": 1920 + }, + { + "epoch": 0.9763039197001461, + "grad_norm": 2.7224597930908203, + "learning_rate": 9.433813575470318e-06, + "loss": 1.2102, + "step": 1921 + }, + { + "epoch": 0.9768121466234674, + "grad_norm": 3.058126449584961, + "learning_rate": 9.433036206962871e-06, + "loss": 1.262, + "step": 1922 + }, + { + "epoch": 0.9773203735467887, + "grad_norm": 2.858962059020996, + "learning_rate": 9.432258337235505e-06, + "loss": 1.2711, + "step": 1923 + }, + { + "epoch": 0.9778286004701099, + "grad_norm": 3.059061050415039, + "learning_rate": 9.43147996637617e-06, + "loss": 1.2354, + "step": 1924 + }, + { + "epoch": 0.9783368273934312, + "grad_norm": 2.8909220695495605, + "learning_rate": 9.43070109447287e-06, + "loss": 1.2501, + "step": 1925 + }, + { + "epoch": 0.9788450543167524, + "grad_norm": 2.7637128829956055, + "learning_rate": 9.42992172161367e-06, + "loss": 1.2199, + "step": 1926 + }, + { + "epoch": 0.9793532812400737, + "grad_norm": 2.7772271633148193, + "learning_rate": 9.429141847886692e-06, + "loss": 1.2338, + "step": 1927 + }, + { + "epoch": 0.9798615081633949, + "grad_norm": 3.01302170753479, + "learning_rate": 9.428361473380108e-06, + "loss": 1.2147, + "step": 1928 + }, + { + "epoch": 0.9803697350867162, + "grad_norm": 2.8627138137817383, + "learning_rate": 9.427580598182151e-06, + "loss": 1.2039, + "step": 1929 + }, + { + "epoch": 0.9808779620100375, + "grad_norm": 2.6455531120300293, + "learning_rate": 9.426799222381114e-06, + "loss": 1.1395, + "step": 1930 + }, + { + "epoch": 0.9813861889333587, + "grad_norm": 2.8535947799682617, + "learning_rate": 9.426017346065339e-06, + "loss": 1.2505, + "step": 1931 + }, + { + "epoch": 0.98189441585668, + "grad_norm": 2.6990885734558105, + "learning_rate": 9.425234969323231e-06, + "loss": 1.2925, + "step": 1932 + }, + { + "epoch": 0.9824026427800012, + "grad_norm": 2.916191816329956, + "learning_rate": 9.424452092243248e-06, + "loss": 1.1982, + "step": 1933 + }, + { + "epoch": 0.9829108697033225, + "grad_norm": 2.7172672748565674, + "learning_rate": 9.423668714913907e-06, + "loss": 1.2339, + "step": 1934 + }, + { + "epoch": 0.9834190966266438, + "grad_norm": 3.132009983062744, + "learning_rate": 9.42288483742378e-06, + "loss": 1.3103, + "step": 1935 + }, + { + "epoch": 0.983927323549965, + "grad_norm": 2.990915536880493, + "learning_rate": 9.422100459861494e-06, + "loss": 1.3056, + "step": 1936 + }, + { + "epoch": 0.9844355504732863, + "grad_norm": 2.8419580459594727, + "learning_rate": 9.421315582315737e-06, + "loss": 1.2209, + "step": 1937 + }, + { + "epoch": 0.9849437773966075, + "grad_norm": 2.8363163471221924, + "learning_rate": 9.420530204875252e-06, + "loss": 1.2706, + "step": 1938 + }, + { + "epoch": 0.9854520043199289, + "grad_norm": 2.7801365852355957, + "learning_rate": 9.419744327628832e-06, + "loss": 1.2744, + "step": 1939 + }, + { + "epoch": 0.9859602312432502, + "grad_norm": 3.0915050506591797, + "learning_rate": 9.418957950665336e-06, + "loss": 1.1607, + "step": 1940 + }, + { + "epoch": 0.9864684581665714, + "grad_norm": 2.951573610305786, + "learning_rate": 9.418171074073675e-06, + "loss": 1.2566, + "step": 1941 + }, + { + "epoch": 0.9869766850898927, + "grad_norm": 2.769648551940918, + "learning_rate": 9.417383697942817e-06, + "loss": 1.2288, + "step": 1942 + }, + { + "epoch": 0.9874849120132139, + "grad_norm": 2.8848860263824463, + "learning_rate": 9.416595822361786e-06, + "loss": 1.2998, + "step": 1943 + }, + { + "epoch": 0.9879931389365352, + "grad_norm": 2.8908326625823975, + "learning_rate": 9.415807447419663e-06, + "loss": 1.2915, + "step": 1944 + }, + { + "epoch": 0.9885013658598564, + "grad_norm": 2.7161648273468018, + "learning_rate": 9.415018573205588e-06, + "loss": 1.2233, + "step": 1945 + }, + { + "epoch": 0.9890095927831777, + "grad_norm": 2.799499988555908, + "learning_rate": 9.414229199808748e-06, + "loss": 1.1483, + "step": 1946 + }, + { + "epoch": 0.989517819706499, + "grad_norm": 3.000262498855591, + "learning_rate": 9.413439327318402e-06, + "loss": 1.3221, + "step": 1947 + }, + { + "epoch": 0.9900260466298202, + "grad_norm": 4.373028755187988, + "learning_rate": 9.412648955823848e-06, + "loss": 1.3722, + "step": 1948 + }, + { + "epoch": 0.9905342735531415, + "grad_norm": 2.90043306350708, + "learning_rate": 9.411858085414456e-06, + "loss": 1.2587, + "step": 1949 + }, + { + "epoch": 0.9910425004764627, + "grad_norm": 3.2279412746429443, + "learning_rate": 9.411066716179643e-06, + "loss": 1.2173, + "step": 1950 + }, + { + "epoch": 0.991550727399784, + "grad_norm": 2.9404780864715576, + "learning_rate": 9.410274848208884e-06, + "loss": 1.2789, + "step": 1951 + }, + { + "epoch": 0.9920589543231053, + "grad_norm": 2.7545523643493652, + "learning_rate": 9.409482481591713e-06, + "loss": 1.1923, + "step": 1952 + }, + { + "epoch": 0.9925671812464265, + "grad_norm": 2.863680839538574, + "learning_rate": 9.408689616417718e-06, + "loss": 1.2571, + "step": 1953 + }, + { + "epoch": 0.9930754081697478, + "grad_norm": 2.761908531188965, + "learning_rate": 9.407896252776543e-06, + "loss": 1.1544, + "step": 1954 + }, + { + "epoch": 0.993583635093069, + "grad_norm": 2.7828609943389893, + "learning_rate": 9.40710239075789e-06, + "loss": 1.2349, + "step": 1955 + }, + { + "epoch": 0.9940918620163903, + "grad_norm": 2.771557092666626, + "learning_rate": 9.406308030451519e-06, + "loss": 1.2707, + "step": 1956 + }, + { + "epoch": 0.9946000889397116, + "grad_norm": 2.988478422164917, + "learning_rate": 9.40551317194724e-06, + "loss": 1.2264, + "step": 1957 + }, + { + "epoch": 0.9951083158630328, + "grad_norm": 2.8643558025360107, + "learning_rate": 9.404717815334928e-06, + "loss": 1.287, + "step": 1958 + }, + { + "epoch": 0.9956165427863541, + "grad_norm": 2.7615389823913574, + "learning_rate": 9.403921960704507e-06, + "loss": 1.1656, + "step": 1959 + }, + { + "epoch": 0.9961247697096753, + "grad_norm": 2.893112897872925, + "learning_rate": 9.40312560814596e-06, + "loss": 1.2117, + "step": 1960 + }, + { + "epoch": 0.9966329966329966, + "grad_norm": 2.8117706775665283, + "learning_rate": 9.402328757749327e-06, + "loss": 1.2288, + "step": 1961 + }, + { + "epoch": 0.9971412235563178, + "grad_norm": 2.8521831035614014, + "learning_rate": 9.401531409604702e-06, + "loss": 1.2678, + "step": 1962 + }, + { + "epoch": 0.9976494504796392, + "grad_norm": 2.985893726348877, + "learning_rate": 9.40073356380224e-06, + "loss": 1.2412, + "step": 1963 + }, + { + "epoch": 0.9981576774029605, + "grad_norm": 2.948859453201294, + "learning_rate": 9.399935220432148e-06, + "loss": 1.3356, + "step": 1964 + }, + { + "epoch": 0.9986659043262817, + "grad_norm": 2.862870454788208, + "learning_rate": 9.39913637958469e-06, + "loss": 1.2378, + "step": 1965 + }, + { + "epoch": 0.999174131249603, + "grad_norm": 2.5982418060302734, + "learning_rate": 9.398337041350186e-06, + "loss": 1.1617, + "step": 1966 + }, + { + "epoch": 0.9996823581729242, + "grad_norm": 2.7536323070526123, + "learning_rate": 9.397537205819014e-06, + "loss": 1.2863, + "step": 1967 + }, + { + "epoch": 1.0001905850962454, + "grad_norm": 2.6883037090301514, + "learning_rate": 9.396736873081607e-06, + "loss": 0.9807, + "step": 1968 + }, + { + "epoch": 1.0006988120195668, + "grad_norm": 3.1424267292022705, + "learning_rate": 9.395936043228455e-06, + "loss": 0.8711, + "step": 1969 + }, + { + "epoch": 1.001207038942888, + "grad_norm": 2.7037315368652344, + "learning_rate": 9.395134716350103e-06, + "loss": 0.8217, + "step": 1970 + }, + { + "epoch": 1.0017152658662092, + "grad_norm": 3.0894687175750732, + "learning_rate": 9.394332892537151e-06, + "loss": 0.9446, + "step": 1971 + }, + { + "epoch": 1.0022234927895306, + "grad_norm": 2.8337249755859375, + "learning_rate": 9.39353057188026e-06, + "loss": 0.9479, + "step": 1972 + }, + { + "epoch": 1.0027317197128518, + "grad_norm": 3.023442506790161, + "learning_rate": 9.392727754470142e-06, + "loss": 0.9362, + "step": 1973 + }, + { + "epoch": 1.003239946636173, + "grad_norm": 3.1481966972351074, + "learning_rate": 9.391924440397569e-06, + "loss": 0.9307, + "step": 1974 + }, + { + "epoch": 1.0037481735594944, + "grad_norm": 3.388725519180298, + "learning_rate": 9.391120629753367e-06, + "loss": 0.809, + "step": 1975 + }, + { + "epoch": 1.0042564004828156, + "grad_norm": 3.3043911457061768, + "learning_rate": 9.390316322628417e-06, + "loss": 0.8328, + "step": 1976 + }, + { + "epoch": 1.0047646274061368, + "grad_norm": 3.6037285327911377, + "learning_rate": 9.38951151911366e-06, + "loss": 0.7763, + "step": 1977 + }, + { + "epoch": 1.005272854329458, + "grad_norm": 3.6332485675811768, + "learning_rate": 9.388706219300088e-06, + "loss": 0.9359, + "step": 1978 + }, + { + "epoch": 1.0057810812527794, + "grad_norm": 3.833462715148926, + "learning_rate": 9.387900423278756e-06, + "loss": 0.8459, + "step": 1979 + }, + { + "epoch": 1.0062893081761006, + "grad_norm": 3.7308406829833984, + "learning_rate": 9.387094131140769e-06, + "loss": 0.8102, + "step": 1980 + }, + { + "epoch": 1.0067975350994218, + "grad_norm": 3.359941005706787, + "learning_rate": 9.386287342977287e-06, + "loss": 0.8305, + "step": 1981 + }, + { + "epoch": 1.0073057620227432, + "grad_norm": 2.9804437160491943, + "learning_rate": 9.385480058879534e-06, + "loss": 0.7978, + "step": 1982 + }, + { + "epoch": 1.0078139889460644, + "grad_norm": 2.755053997039795, + "learning_rate": 9.384672278938785e-06, + "loss": 0.8343, + "step": 1983 + }, + { + "epoch": 1.0083222158693856, + "grad_norm": 3.064114809036255, + "learning_rate": 9.383864003246369e-06, + "loss": 0.9288, + "step": 1984 + }, + { + "epoch": 1.0088304427927068, + "grad_norm": 3.0753376483917236, + "learning_rate": 9.383055231893674e-06, + "loss": 0.8818, + "step": 1985 + }, + { + "epoch": 1.0093386697160283, + "grad_norm": 2.8319714069366455, + "learning_rate": 9.382245964972146e-06, + "loss": 0.8849, + "step": 1986 + }, + { + "epoch": 1.0098468966393495, + "grad_norm": 2.89162278175354, + "learning_rate": 9.38143620257328e-06, + "loss": 0.8338, + "step": 1987 + }, + { + "epoch": 1.0103551235626707, + "grad_norm": 3.2012033462524414, + "learning_rate": 9.380625944788635e-06, + "loss": 0.8047, + "step": 1988 + }, + { + "epoch": 1.010863350485992, + "grad_norm": 3.0053231716156006, + "learning_rate": 9.379815191709823e-06, + "loss": 0.8174, + "step": 1989 + }, + { + "epoch": 1.0113715774093133, + "grad_norm": 3.070302963256836, + "learning_rate": 9.379003943428508e-06, + "loss": 0.8858, + "step": 1990 + }, + { + "epoch": 1.0118798043326345, + "grad_norm": 2.9416215419769287, + "learning_rate": 9.378192200036418e-06, + "loss": 0.8167, + "step": 1991 + }, + { + "epoch": 1.012388031255956, + "grad_norm": 3.202517509460449, + "learning_rate": 9.377379961625328e-06, + "loss": 0.9251, + "step": 1992 + }, + { + "epoch": 1.012896258179277, + "grad_norm": 3.0757060050964355, + "learning_rate": 9.376567228287078e-06, + "loss": 0.8752, + "step": 1993 + }, + { + "epoch": 1.0134044851025983, + "grad_norm": 2.960498571395874, + "learning_rate": 9.375754000113555e-06, + "loss": 0.8223, + "step": 1994 + }, + { + "epoch": 1.0139127120259195, + "grad_norm": 3.186260223388672, + "learning_rate": 9.374940277196709e-06, + "loss": 0.786, + "step": 1995 + }, + { + "epoch": 1.014420938949241, + "grad_norm": 3.0160162448883057, + "learning_rate": 9.374126059628545e-06, + "loss": 0.8998, + "step": 1996 + }, + { + "epoch": 1.014929165872562, + "grad_norm": 3.1801722049713135, + "learning_rate": 9.373311347501117e-06, + "loss": 0.8987, + "step": 1997 + }, + { + "epoch": 1.0154373927958833, + "grad_norm": 3.2074148654937744, + "learning_rate": 9.372496140906546e-06, + "loss": 0.8403, + "step": 1998 + }, + { + "epoch": 1.0159456197192047, + "grad_norm": 2.9431893825531006, + "learning_rate": 9.371680439936999e-06, + "loss": 0.8974, + "step": 1999 + }, + { + "epoch": 1.016453846642526, + "grad_norm": 2.9264800548553467, + "learning_rate": 9.370864244684705e-06, + "loss": 0.8356, + "step": 2000 + }, + { + "epoch": 1.016453846642526, + "eval_loss": 1.2894710302352905, + "eval_runtime": 14.6197, + "eval_samples_per_second": 27.36, + "eval_steps_per_second": 3.42, + "step": 2000 + }, + { + "epoch": 1.0169620735658471, + "grad_norm": 3.1898536682128906, + "learning_rate": 9.370047555241947e-06, + "loss": 0.9506, + "step": 2001 + }, + { + "epoch": 1.0174703004891683, + "grad_norm": 3.170736312866211, + "learning_rate": 9.369230371701063e-06, + "loss": 0.9416, + "step": 2002 + }, + { + "epoch": 1.0179785274124897, + "grad_norm": 3.0140738487243652, + "learning_rate": 9.368412694154447e-06, + "loss": 0.7751, + "step": 2003 + }, + { + "epoch": 1.018486754335811, + "grad_norm": 3.268325090408325, + "learning_rate": 9.36759452269455e-06, + "loss": 0.9212, + "step": 2004 + }, + { + "epoch": 1.0189949812591321, + "grad_norm": 3.0660955905914307, + "learning_rate": 9.36677585741388e-06, + "loss": 0.8109, + "step": 2005 + }, + { + "epoch": 1.0195032081824535, + "grad_norm": 3.21256160736084, + "learning_rate": 9.365956698404997e-06, + "loss": 0.8029, + "step": 2006 + }, + { + "epoch": 1.0200114351057747, + "grad_norm": 3.118746757507324, + "learning_rate": 9.365137045760519e-06, + "loss": 0.8211, + "step": 2007 + }, + { + "epoch": 1.020519662029096, + "grad_norm": 3.166558027267456, + "learning_rate": 9.36431689957312e-06, + "loss": 0.8778, + "step": 2008 + }, + { + "epoch": 1.0210278889524174, + "grad_norm": 3.0696887969970703, + "learning_rate": 9.363496259935531e-06, + "loss": 0.8701, + "step": 2009 + }, + { + "epoch": 1.0215361158757386, + "grad_norm": 3.5696535110473633, + "learning_rate": 9.362675126940536e-06, + "loss": 0.9573, + "step": 2010 + }, + { + "epoch": 1.0220443427990598, + "grad_norm": 3.436431884765625, + "learning_rate": 9.361853500680976e-06, + "loss": 0.875, + "step": 2011 + }, + { + "epoch": 1.022552569722381, + "grad_norm": 3.065523862838745, + "learning_rate": 9.36103138124975e-06, + "loss": 0.789, + "step": 2012 + }, + { + "epoch": 1.0230607966457024, + "grad_norm": 3.238952875137329, + "learning_rate": 9.360208768739807e-06, + "loss": 0.8384, + "step": 2013 + }, + { + "epoch": 1.0235690235690236, + "grad_norm": 3.1629438400268555, + "learning_rate": 9.359385663244158e-06, + "loss": 0.8615, + "step": 2014 + }, + { + "epoch": 1.0240772504923448, + "grad_norm": 3.0951144695281982, + "learning_rate": 9.358562064855868e-06, + "loss": 0.8759, + "step": 2015 + }, + { + "epoch": 1.0245854774156662, + "grad_norm": 2.992215871810913, + "learning_rate": 9.357737973668056e-06, + "loss": 0.8095, + "step": 2016 + }, + { + "epoch": 1.0250937043389874, + "grad_norm": 3.3016228675842285, + "learning_rate": 9.356913389773895e-06, + "loss": 0.784, + "step": 2017 + }, + { + "epoch": 1.0256019312623086, + "grad_norm": 2.8466439247131348, + "learning_rate": 9.35608831326662e-06, + "loss": 0.7665, + "step": 2018 + }, + { + "epoch": 1.0261101581856298, + "grad_norm": 2.907198667526245, + "learning_rate": 9.355262744239517e-06, + "loss": 0.7961, + "step": 2019 + }, + { + "epoch": 1.0266183851089512, + "grad_norm": 3.4752185344696045, + "learning_rate": 9.354436682785928e-06, + "loss": 0.7864, + "step": 2020 + }, + { + "epoch": 1.0271266120322724, + "grad_norm": 3.046924352645874, + "learning_rate": 9.35361012899925e-06, + "loss": 0.7442, + "step": 2021 + }, + { + "epoch": 1.0276348389555936, + "grad_norm": 3.305177688598633, + "learning_rate": 9.35278308297294e-06, + "loss": 0.859, + "step": 2022 + }, + { + "epoch": 1.028143065878915, + "grad_norm": 3.166316270828247, + "learning_rate": 9.351955544800509e-06, + "loss": 0.8661, + "step": 2023 + }, + { + "epoch": 1.0286512928022362, + "grad_norm": 3.3500163555145264, + "learning_rate": 9.351127514575517e-06, + "loss": 0.8477, + "step": 2024 + }, + { + "epoch": 1.0291595197255574, + "grad_norm": 3.2255213260650635, + "learning_rate": 9.350298992391589e-06, + "loss": 0.8366, + "step": 2025 + }, + { + "epoch": 1.0296677466488786, + "grad_norm": 3.1444172859191895, + "learning_rate": 9.3494699783424e-06, + "loss": 0.855, + "step": 2026 + }, + { + "epoch": 1.0301759735722, + "grad_norm": 3.118273973464966, + "learning_rate": 9.348640472521682e-06, + "loss": 0.8224, + "step": 2027 + }, + { + "epoch": 1.0306842004955212, + "grad_norm": 3.104978084564209, + "learning_rate": 9.347810475023225e-06, + "loss": 0.8456, + "step": 2028 + }, + { + "epoch": 1.0311924274188424, + "grad_norm": 3.197139263153076, + "learning_rate": 9.34697998594087e-06, + "loss": 0.8209, + "step": 2029 + }, + { + "epoch": 1.0317006543421638, + "grad_norm": 3.226208448410034, + "learning_rate": 9.346149005368516e-06, + "loss": 0.928, + "step": 2030 + }, + { + "epoch": 1.032208881265485, + "grad_norm": 2.900405168533325, + "learning_rate": 9.345317533400122e-06, + "loss": 0.7765, + "step": 2031 + }, + { + "epoch": 1.0327171081888062, + "grad_norm": 3.115267038345337, + "learning_rate": 9.344485570129692e-06, + "loss": 0.814, + "step": 2032 + }, + { + "epoch": 1.0332253351121277, + "grad_norm": 3.040104866027832, + "learning_rate": 9.343653115651295e-06, + "loss": 0.7718, + "step": 2033 + }, + { + "epoch": 1.0337335620354489, + "grad_norm": 2.962225914001465, + "learning_rate": 9.34282017005905e-06, + "loss": 0.7983, + "step": 2034 + }, + { + "epoch": 1.03424178895877, + "grad_norm": 3.1415133476257324, + "learning_rate": 9.341986733447137e-06, + "loss": 0.8133, + "step": 2035 + }, + { + "epoch": 1.0347500158820913, + "grad_norm": 3.1007273197174072, + "learning_rate": 9.341152805909786e-06, + "loss": 0.7765, + "step": 2036 + }, + { + "epoch": 1.0352582428054127, + "grad_norm": 3.0376369953155518, + "learning_rate": 9.340318387541285e-06, + "loss": 0.8321, + "step": 2037 + }, + { + "epoch": 1.0357664697287339, + "grad_norm": 2.9017696380615234, + "learning_rate": 9.339483478435979e-06, + "loss": 0.8479, + "step": 2038 + }, + { + "epoch": 1.036274696652055, + "grad_norm": 6.134103775024414, + "learning_rate": 9.338648078688263e-06, + "loss": 0.7849, + "step": 2039 + }, + { + "epoch": 1.0367829235753765, + "grad_norm": 3.308187246322632, + "learning_rate": 9.337812188392596e-06, + "loss": 0.8817, + "step": 2040 + }, + { + "epoch": 1.0372911504986977, + "grad_norm": 3.367530584335327, + "learning_rate": 9.336975807643485e-06, + "loss": 0.8884, + "step": 2041 + }, + { + "epoch": 1.0377993774220189, + "grad_norm": 3.5947296619415283, + "learning_rate": 9.336138936535494e-06, + "loss": 0.929, + "step": 2042 + }, + { + "epoch": 1.03830760434534, + "grad_norm": 3.12381649017334, + "learning_rate": 9.335301575163247e-06, + "loss": 0.7718, + "step": 2043 + }, + { + "epoch": 1.0388158312686615, + "grad_norm": 3.505775213241577, + "learning_rate": 9.334463723621415e-06, + "loss": 0.8644, + "step": 2044 + }, + { + "epoch": 1.0393240581919827, + "grad_norm": 3.172312021255493, + "learning_rate": 9.333625382004734e-06, + "loss": 0.8577, + "step": 2045 + }, + { + "epoch": 1.039832285115304, + "grad_norm": 3.0678904056549072, + "learning_rate": 9.332786550407989e-06, + "loss": 0.7207, + "step": 2046 + }, + { + "epoch": 1.0403405120386253, + "grad_norm": 2.993863105773926, + "learning_rate": 9.331947228926024e-06, + "loss": 0.7157, + "step": 2047 + }, + { + "epoch": 1.0408487389619465, + "grad_norm": 3.0315968990325928, + "learning_rate": 9.331107417653734e-06, + "loss": 0.8081, + "step": 2048 + }, + { + "epoch": 1.0413569658852677, + "grad_norm": 3.491834878921509, + "learning_rate": 9.330267116686072e-06, + "loss": 0.9326, + "step": 2049 + }, + { + "epoch": 1.0418651928085891, + "grad_norm": 2.9259064197540283, + "learning_rate": 9.32942632611805e-06, + "loss": 0.8041, + "step": 2050 + }, + { + "epoch": 1.0423734197319103, + "grad_norm": 3.325554847717285, + "learning_rate": 9.328585046044728e-06, + "loss": 0.8363, + "step": 2051 + }, + { + "epoch": 1.0428816466552315, + "grad_norm": 3.138277053833008, + "learning_rate": 9.327743276561226e-06, + "loss": 0.8907, + "step": 2052 + }, + { + "epoch": 1.0433898735785527, + "grad_norm": 2.964484214782715, + "learning_rate": 9.32690101776272e-06, + "loss": 0.9177, + "step": 2053 + }, + { + "epoch": 1.0438981005018741, + "grad_norm": 3.1464931964874268, + "learning_rate": 9.326058269744436e-06, + "loss": 0.7592, + "step": 2054 + }, + { + "epoch": 1.0444063274251953, + "grad_norm": 2.9225363731384277, + "learning_rate": 9.325215032601664e-06, + "loss": 0.8515, + "step": 2055 + }, + { + "epoch": 1.0449145543485165, + "grad_norm": 6.968871116638184, + "learning_rate": 9.32437130642974e-06, + "loss": 0.8868, + "step": 2056 + }, + { + "epoch": 1.045422781271838, + "grad_norm": 2.892380714416504, + "learning_rate": 9.323527091324062e-06, + "loss": 0.7601, + "step": 2057 + }, + { + "epoch": 1.0459310081951592, + "grad_norm": 3.065734386444092, + "learning_rate": 9.322682387380082e-06, + "loss": 0.9312, + "step": 2058 + }, + { + "epoch": 1.0464392351184804, + "grad_norm": 3.2454822063446045, + "learning_rate": 9.321837194693304e-06, + "loss": 0.8848, + "step": 2059 + }, + { + "epoch": 1.0469474620418016, + "grad_norm": 3.0628859996795654, + "learning_rate": 9.32099151335929e-06, + "loss": 0.8395, + "step": 2060 + }, + { + "epoch": 1.047455688965123, + "grad_norm": 2.7631242275238037, + "learning_rate": 9.320145343473656e-06, + "loss": 0.6984, + "step": 2061 + }, + { + "epoch": 1.0479639158884442, + "grad_norm": 3.4513697624206543, + "learning_rate": 9.319298685132076e-06, + "loss": 0.8301, + "step": 2062 + }, + { + "epoch": 1.0484721428117654, + "grad_norm": 3.0544557571411133, + "learning_rate": 9.318451538430277e-06, + "loss": 0.8076, + "step": 2063 + }, + { + "epoch": 1.0489803697350868, + "grad_norm": 3.6341161727905273, + "learning_rate": 9.31760390346404e-06, + "loss": 0.9105, + "step": 2064 + }, + { + "epoch": 1.049488596658408, + "grad_norm": 3.331022262573242, + "learning_rate": 9.316755780329201e-06, + "loss": 0.8577, + "step": 2065 + }, + { + "epoch": 1.0499968235817292, + "grad_norm": 3.1098642349243164, + "learning_rate": 9.315907169121657e-06, + "loss": 0.7183, + "step": 2066 + }, + { + "epoch": 1.0505050505050506, + "grad_norm": 3.2029342651367188, + "learning_rate": 9.315058069937352e-06, + "loss": 0.8624, + "step": 2067 + }, + { + "epoch": 1.0510132774283718, + "grad_norm": 2.9517741203308105, + "learning_rate": 9.31420848287229e-06, + "loss": 0.8058, + "step": 2068 + }, + { + "epoch": 1.051521504351693, + "grad_norm": 3.1466259956359863, + "learning_rate": 9.313358408022533e-06, + "loss": 0.868, + "step": 2069 + }, + { + "epoch": 1.0520297312750142, + "grad_norm": 3.0388054847717285, + "learning_rate": 9.31250784548419e-06, + "loss": 0.855, + "step": 2070 + }, + { + "epoch": 1.0525379581983356, + "grad_norm": 2.9505624771118164, + "learning_rate": 9.311656795353431e-06, + "loss": 0.7738, + "step": 2071 + }, + { + "epoch": 1.0530461851216568, + "grad_norm": 3.3491604328155518, + "learning_rate": 9.31080525772648e-06, + "loss": 0.8004, + "step": 2072 + }, + { + "epoch": 1.053554412044978, + "grad_norm": 2.904555082321167, + "learning_rate": 9.309953232699617e-06, + "loss": 0.8718, + "step": 2073 + }, + { + "epoch": 1.0540626389682994, + "grad_norm": 2.8941566944122314, + "learning_rate": 9.309100720369176e-06, + "loss": 0.7971, + "step": 2074 + }, + { + "epoch": 1.0545708658916206, + "grad_norm": 3.0532689094543457, + "learning_rate": 9.308247720831542e-06, + "loss": 0.8472, + "step": 2075 + }, + { + "epoch": 1.0550790928149418, + "grad_norm": 3.448359489440918, + "learning_rate": 9.307394234183162e-06, + "loss": 0.8943, + "step": 2076 + }, + { + "epoch": 1.055587319738263, + "grad_norm": 3.1240499019622803, + "learning_rate": 9.306540260520535e-06, + "loss": 0.9552, + "step": 2077 + }, + { + "epoch": 1.0560955466615845, + "grad_norm": 3.350869655609131, + "learning_rate": 9.305685799940218e-06, + "loss": 0.8265, + "step": 2078 + }, + { + "epoch": 1.0566037735849056, + "grad_norm": 3.4039957523345947, + "learning_rate": 9.304830852538817e-06, + "loss": 0.8602, + "step": 2079 + }, + { + "epoch": 1.0571120005082268, + "grad_norm": 3.3318874835968018, + "learning_rate": 9.303975418412996e-06, + "loss": 0.9006, + "step": 2080 + }, + { + "epoch": 1.0576202274315483, + "grad_norm": 2.9756813049316406, + "learning_rate": 9.303119497659476e-06, + "loss": 0.8273, + "step": 2081 + }, + { + "epoch": 1.0581284543548695, + "grad_norm": 2.9196019172668457, + "learning_rate": 9.302263090375032e-06, + "loss": 0.8361, + "step": 2082 + }, + { + "epoch": 1.0586366812781907, + "grad_norm": 3.343363046646118, + "learning_rate": 9.30140619665649e-06, + "loss": 0.8124, + "step": 2083 + }, + { + "epoch": 1.059144908201512, + "grad_norm": 3.252643585205078, + "learning_rate": 9.300548816600739e-06, + "loss": 0.8564, + "step": 2084 + }, + { + "epoch": 1.0596531351248333, + "grad_norm": 3.0798771381378174, + "learning_rate": 9.299690950304716e-06, + "loss": 0.8804, + "step": 2085 + }, + { + "epoch": 1.0601613620481545, + "grad_norm": 3.143292188644409, + "learning_rate": 9.298832597865416e-06, + "loss": 0.8426, + "step": 2086 + }, + { + "epoch": 1.0606695889714757, + "grad_norm": 3.0367817878723145, + "learning_rate": 9.297973759379888e-06, + "loss": 0.8423, + "step": 2087 + }, + { + "epoch": 1.061177815894797, + "grad_norm": 3.2499539852142334, + "learning_rate": 9.297114434945236e-06, + "loss": 0.9039, + "step": 2088 + }, + { + "epoch": 1.0616860428181183, + "grad_norm": 3.1852822303771973, + "learning_rate": 9.296254624658618e-06, + "loss": 0.7962, + "step": 2089 + }, + { + "epoch": 1.0621942697414395, + "grad_norm": 3.22925066947937, + "learning_rate": 9.295394328617251e-06, + "loss": 0.7997, + "step": 2090 + }, + { + "epoch": 1.062702496664761, + "grad_norm": 3.0404813289642334, + "learning_rate": 9.294533546918406e-06, + "loss": 0.8152, + "step": 2091 + }, + { + "epoch": 1.063210723588082, + "grad_norm": 3.014554977416992, + "learning_rate": 9.2936722796594e-06, + "loss": 0.8412, + "step": 2092 + }, + { + "epoch": 1.0637189505114033, + "grad_norm": 3.2184641361236572, + "learning_rate": 9.292810526937617e-06, + "loss": 0.8574, + "step": 2093 + }, + { + "epoch": 1.0642271774347245, + "grad_norm": 3.2080061435699463, + "learning_rate": 9.29194828885049e-06, + "loss": 0.8677, + "step": 2094 + }, + { + "epoch": 1.064735404358046, + "grad_norm": 3.276824474334717, + "learning_rate": 9.291085565495508e-06, + "loss": 0.8431, + "step": 2095 + }, + { + "epoch": 1.0652436312813671, + "grad_norm": 3.0697712898254395, + "learning_rate": 9.290222356970213e-06, + "loss": 0.9106, + "step": 2096 + }, + { + "epoch": 1.0657518582046883, + "grad_norm": 3.019782066345215, + "learning_rate": 9.289358663372204e-06, + "loss": 0.7905, + "step": 2097 + }, + { + "epoch": 1.0662600851280097, + "grad_norm": 3.2518410682678223, + "learning_rate": 9.288494484799136e-06, + "loss": 0.8393, + "step": 2098 + }, + { + "epoch": 1.066768312051331, + "grad_norm": 2.8931727409362793, + "learning_rate": 9.287629821348714e-06, + "loss": 0.7574, + "step": 2099 + }, + { + "epoch": 1.0672765389746521, + "grad_norm": 3.020138740539551, + "learning_rate": 9.286764673118705e-06, + "loss": 0.7832, + "step": 2100 + }, + { + "epoch": 1.0677847658979736, + "grad_norm": 3.068448305130005, + "learning_rate": 9.285899040206922e-06, + "loss": 0.7436, + "step": 2101 + }, + { + "epoch": 1.0682929928212948, + "grad_norm": 3.2184550762176514, + "learning_rate": 9.28503292271124e-06, + "loss": 0.9075, + "step": 2102 + }, + { + "epoch": 1.068801219744616, + "grad_norm": 2.9750399589538574, + "learning_rate": 9.284166320729588e-06, + "loss": 0.8305, + "step": 2103 + }, + { + "epoch": 1.0693094466679371, + "grad_norm": 3.4522347450256348, + "learning_rate": 9.283299234359946e-06, + "loss": 0.7978, + "step": 2104 + }, + { + "epoch": 1.0698176735912586, + "grad_norm": 3.1621932983398438, + "learning_rate": 9.28243166370035e-06, + "loss": 0.8388, + "step": 2105 + }, + { + "epoch": 1.0703259005145798, + "grad_norm": 3.238377809524536, + "learning_rate": 9.281563608848893e-06, + "loss": 0.7583, + "step": 2106 + }, + { + "epoch": 1.070834127437901, + "grad_norm": 3.1495258808135986, + "learning_rate": 9.280695069903722e-06, + "loss": 0.7382, + "step": 2107 + }, + { + "epoch": 1.0713423543612224, + "grad_norm": 3.1268153190612793, + "learning_rate": 9.279826046963037e-06, + "loss": 0.7512, + "step": 2108 + }, + { + "epoch": 1.0718505812845436, + "grad_norm": 3.2700624465942383, + "learning_rate": 9.278956540125094e-06, + "loss": 0.7999, + "step": 2109 + }, + { + "epoch": 1.0723588082078648, + "grad_norm": 2.898972272872925, + "learning_rate": 9.278086549488203e-06, + "loss": 0.7911, + "step": 2110 + }, + { + "epoch": 1.072867035131186, + "grad_norm": 3.0485572814941406, + "learning_rate": 9.27721607515073e-06, + "loss": 0.7897, + "step": 2111 + }, + { + "epoch": 1.0733752620545074, + "grad_norm": 2.9671947956085205, + "learning_rate": 9.276345117211096e-06, + "loss": 0.8024, + "step": 2112 + }, + { + "epoch": 1.0738834889778286, + "grad_norm": 3.489755868911743, + "learning_rate": 9.275473675767773e-06, + "loss": 0.8729, + "step": 2113 + }, + { + "epoch": 1.0743917159011498, + "grad_norm": 3.384394645690918, + "learning_rate": 9.274601750919292e-06, + "loss": 0.8471, + "step": 2114 + }, + { + "epoch": 1.0748999428244712, + "grad_norm": 3.0558526515960693, + "learning_rate": 9.273729342764237e-06, + "loss": 0.801, + "step": 2115 + }, + { + "epoch": 1.0754081697477924, + "grad_norm": 3.1915698051452637, + "learning_rate": 9.272856451401246e-06, + "loss": 0.8724, + "step": 2116 + }, + { + "epoch": 1.0759163966711136, + "grad_norm": 3.234802722930908, + "learning_rate": 9.271983076929012e-06, + "loss": 0.8306, + "step": 2117 + }, + { + "epoch": 1.076424623594435, + "grad_norm": 3.1662769317626953, + "learning_rate": 9.271109219446282e-06, + "loss": 0.8037, + "step": 2118 + }, + { + "epoch": 1.0769328505177562, + "grad_norm": 3.228738784790039, + "learning_rate": 9.270234879051861e-06, + "loss": 0.7598, + "step": 2119 + }, + { + "epoch": 1.0774410774410774, + "grad_norm": 11.493374824523926, + "learning_rate": 9.269360055844605e-06, + "loss": 0.8335, + "step": 2120 + }, + { + "epoch": 1.0779493043643986, + "grad_norm": 3.0331759452819824, + "learning_rate": 9.268484749923424e-06, + "loss": 0.6947, + "step": 2121 + }, + { + "epoch": 1.07845753128772, + "grad_norm": 3.314284563064575, + "learning_rate": 9.267608961387287e-06, + "loss": 0.909, + "step": 2122 + }, + { + "epoch": 1.0789657582110412, + "grad_norm": 3.0632483959198, + "learning_rate": 9.266732690335211e-06, + "loss": 0.8805, + "step": 2123 + }, + { + "epoch": 1.0794739851343624, + "grad_norm": 3.0312142372131348, + "learning_rate": 9.265855936866276e-06, + "loss": 0.8584, + "step": 2124 + }, + { + "epoch": 1.0799822120576839, + "grad_norm": 3.4391958713531494, + "learning_rate": 9.264978701079607e-06, + "loss": 0.7548, + "step": 2125 + }, + { + "epoch": 1.080490438981005, + "grad_norm": 2.9293901920318604, + "learning_rate": 9.264100983074394e-06, + "loss": 0.8314, + "step": 2126 + }, + { + "epoch": 1.0809986659043263, + "grad_norm": 3.2253024578094482, + "learning_rate": 9.26322278294987e-06, + "loss": 0.9104, + "step": 2127 + }, + { + "epoch": 1.0815068928276474, + "grad_norm": 3.0602898597717285, + "learning_rate": 9.262344100805332e-06, + "loss": 0.78, + "step": 2128 + }, + { + "epoch": 1.0820151197509689, + "grad_norm": 3.211329460144043, + "learning_rate": 9.261464936740127e-06, + "loss": 0.8241, + "step": 2129 + }, + { + "epoch": 1.08252334667429, + "grad_norm": 2.9432098865509033, + "learning_rate": 9.260585290853658e-06, + "loss": 0.7371, + "step": 2130 + }, + { + "epoch": 1.0830315735976113, + "grad_norm": 3.190213203430176, + "learning_rate": 9.259705163245381e-06, + "loss": 0.909, + "step": 2131 + }, + { + "epoch": 1.0835398005209327, + "grad_norm": 3.1257691383361816, + "learning_rate": 9.258824554014807e-06, + "loss": 0.8234, + "step": 2132 + }, + { + "epoch": 1.0840480274442539, + "grad_norm": 2.958376884460449, + "learning_rate": 9.257943463261503e-06, + "loss": 0.8303, + "step": 2133 + }, + { + "epoch": 1.084556254367575, + "grad_norm": 3.43859601020813, + "learning_rate": 9.257061891085091e-06, + "loss": 0.7861, + "step": 2134 + }, + { + "epoch": 1.0850644812908965, + "grad_norm": 2.9984450340270996, + "learning_rate": 9.256179837585242e-06, + "loss": 0.7126, + "step": 2135 + }, + { + "epoch": 1.0855727082142177, + "grad_norm": 3.1922214031219482, + "learning_rate": 9.255297302861685e-06, + "loss": 0.8999, + "step": 2136 + }, + { + "epoch": 1.086080935137539, + "grad_norm": 2.9793853759765625, + "learning_rate": 9.254414287014208e-06, + "loss": 0.8929, + "step": 2137 + }, + { + "epoch": 1.08658916206086, + "grad_norm": 3.271268129348755, + "learning_rate": 9.253530790142646e-06, + "loss": 0.8677, + "step": 2138 + }, + { + "epoch": 1.0870973889841815, + "grad_norm": 3.011582612991333, + "learning_rate": 9.25264681234689e-06, + "loss": 0.846, + "step": 2139 + }, + { + "epoch": 1.0876056159075027, + "grad_norm": 3.042726755142212, + "learning_rate": 9.251762353726887e-06, + "loss": 0.7305, + "step": 2140 + }, + { + "epoch": 1.088113842830824, + "grad_norm": 3.29084849357605, + "learning_rate": 9.250877414382641e-06, + "loss": 0.8388, + "step": 2141 + }, + { + "epoch": 1.0886220697541453, + "grad_norm": 3.143230676651001, + "learning_rate": 9.249991994414207e-06, + "loss": 0.9816, + "step": 2142 + }, + { + "epoch": 1.0891302966774665, + "grad_norm": 2.8965611457824707, + "learning_rate": 9.249106093921692e-06, + "loss": 0.7588, + "step": 2143 + }, + { + "epoch": 1.0896385236007877, + "grad_norm": 3.2397620677948, + "learning_rate": 9.24821971300526e-06, + "loss": 0.8879, + "step": 2144 + }, + { + "epoch": 1.090146750524109, + "grad_norm": 2.9761197566986084, + "learning_rate": 9.247332851765134e-06, + "loss": 0.797, + "step": 2145 + }, + { + "epoch": 1.0906549774474303, + "grad_norm": 3.0833804607391357, + "learning_rate": 9.24644551030158e-06, + "loss": 0.8104, + "step": 2146 + }, + { + "epoch": 1.0911632043707515, + "grad_norm": 2.98724365234375, + "learning_rate": 9.24555768871493e-06, + "loss": 0.812, + "step": 2147 + }, + { + "epoch": 1.0916714312940727, + "grad_norm": 3.2756662368774414, + "learning_rate": 9.244669387105563e-06, + "loss": 0.9076, + "step": 2148 + }, + { + "epoch": 1.0921796582173942, + "grad_norm": 3.199113130569458, + "learning_rate": 9.243780605573918e-06, + "loss": 0.8027, + "step": 2149 + }, + { + "epoch": 1.0926878851407154, + "grad_norm": 2.9473695755004883, + "learning_rate": 9.24289134422048e-06, + "loss": 0.8426, + "step": 2150 + }, + { + "epoch": 1.0931961120640366, + "grad_norm": 3.1321775913238525, + "learning_rate": 9.242001603145795e-06, + "loss": 0.8629, + "step": 2151 + }, + { + "epoch": 1.093704338987358, + "grad_norm": 3.111842155456543, + "learning_rate": 9.241111382450463e-06, + "loss": 0.8082, + "step": 2152 + }, + { + "epoch": 1.0942125659106792, + "grad_norm": 4.241421699523926, + "learning_rate": 9.240220682235133e-06, + "loss": 0.8441, + "step": 2153 + }, + { + "epoch": 1.0947207928340004, + "grad_norm": 3.283623218536377, + "learning_rate": 9.239329502600515e-06, + "loss": 0.7652, + "step": 2154 + }, + { + "epoch": 1.0952290197573216, + "grad_norm": 2.9305100440979004, + "learning_rate": 9.23843784364737e-06, + "loss": 0.8427, + "step": 2155 + }, + { + "epoch": 1.095737246680643, + "grad_norm": 2.994626998901367, + "learning_rate": 9.23754570547651e-06, + "loss": 0.7648, + "step": 2156 + }, + { + "epoch": 1.0962454736039642, + "grad_norm": 3.076044797897339, + "learning_rate": 9.236653088188807e-06, + "loss": 0.7861, + "step": 2157 + }, + { + "epoch": 1.0967537005272854, + "grad_norm": 3.4667749404907227, + "learning_rate": 9.235759991885185e-06, + "loss": 0.9786, + "step": 2158 + }, + { + "epoch": 1.0972619274506068, + "grad_norm": 3.2529866695404053, + "learning_rate": 9.234866416666619e-06, + "loss": 0.784, + "step": 2159 + }, + { + "epoch": 1.097770154373928, + "grad_norm": 3.1599793434143066, + "learning_rate": 9.233972362634143e-06, + "loss": 0.96, + "step": 2160 + }, + { + "epoch": 1.0982783812972492, + "grad_norm": 3.1152777671813965, + "learning_rate": 9.233077829888841e-06, + "loss": 0.7875, + "step": 2161 + }, + { + "epoch": 1.0987866082205704, + "grad_norm": 3.0375049114227295, + "learning_rate": 9.232182818531856e-06, + "loss": 0.9108, + "step": 2162 + }, + { + "epoch": 1.0992948351438918, + "grad_norm": 2.9311556816101074, + "learning_rate": 9.23128732866438e-06, + "loss": 0.8091, + "step": 2163 + }, + { + "epoch": 1.099803062067213, + "grad_norm": 2.9771041870117188, + "learning_rate": 9.230391360387661e-06, + "loss": 0.8187, + "step": 2164 + }, + { + "epoch": 1.1003112889905342, + "grad_norm": 3.184452533721924, + "learning_rate": 9.229494913803003e-06, + "loss": 0.7583, + "step": 2165 + }, + { + "epoch": 1.1008195159138556, + "grad_norm": 3.0859079360961914, + "learning_rate": 9.228597989011761e-06, + "loss": 0.813, + "step": 2166 + }, + { + "epoch": 1.1013277428371768, + "grad_norm": 3.111276865005493, + "learning_rate": 9.227700586115347e-06, + "loss": 0.7791, + "step": 2167 + }, + { + "epoch": 1.101835969760498, + "grad_norm": 3.0945050716400146, + "learning_rate": 9.226802705215224e-06, + "loss": 0.8495, + "step": 2168 + }, + { + "epoch": 1.1023441966838192, + "grad_norm": 3.492349863052368, + "learning_rate": 9.225904346412913e-06, + "loss": 0.8259, + "step": 2169 + }, + { + "epoch": 1.1028524236071406, + "grad_norm": 3.0135536193847656, + "learning_rate": 9.225005509809984e-06, + "loss": 0.7308, + "step": 2170 + }, + { + "epoch": 1.1033606505304618, + "grad_norm": 3.3793108463287354, + "learning_rate": 9.224106195508064e-06, + "loss": 0.8777, + "step": 2171 + }, + { + "epoch": 1.103868877453783, + "grad_norm": 3.311250925064087, + "learning_rate": 9.223206403608836e-06, + "loss": 0.8091, + "step": 2172 + }, + { + "epoch": 1.1043771043771045, + "grad_norm": 3.3394904136657715, + "learning_rate": 9.222306134214032e-06, + "loss": 0.898, + "step": 2173 + }, + { + "epoch": 1.1048853313004257, + "grad_norm": 2.9980368614196777, + "learning_rate": 9.221405387425441e-06, + "loss": 0.8628, + "step": 2174 + }, + { + "epoch": 1.1053935582237469, + "grad_norm": 3.0090014934539795, + "learning_rate": 9.22050416334491e-06, + "loss": 0.8591, + "step": 2175 + }, + { + "epoch": 1.105901785147068, + "grad_norm": 3.2262046337127686, + "learning_rate": 9.21960246207433e-06, + "loss": 0.9521, + "step": 2176 + }, + { + "epoch": 1.1064100120703895, + "grad_norm": 3.0029313564300537, + "learning_rate": 9.218700283715653e-06, + "loss": 0.9119, + "step": 2177 + }, + { + "epoch": 1.1069182389937107, + "grad_norm": 2.9279654026031494, + "learning_rate": 9.217797628370886e-06, + "loss": 0.8419, + "step": 2178 + }, + { + "epoch": 1.1074264659170319, + "grad_norm": 3.0237679481506348, + "learning_rate": 9.216894496142083e-06, + "loss": 0.8855, + "step": 2179 + }, + { + "epoch": 1.1079346928403533, + "grad_norm": 3.1915111541748047, + "learning_rate": 9.215990887131362e-06, + "loss": 0.9484, + "step": 2180 + }, + { + "epoch": 1.1084429197636745, + "grad_norm": 3.263805627822876, + "learning_rate": 9.215086801440885e-06, + "loss": 0.9143, + "step": 2181 + }, + { + "epoch": 1.1089511466869957, + "grad_norm": 2.8310515880584717, + "learning_rate": 9.214182239172875e-06, + "loss": 0.7704, + "step": 2182 + }, + { + "epoch": 1.109459373610317, + "grad_norm": 3.0871376991271973, + "learning_rate": 9.213277200429604e-06, + "loss": 0.9276, + "step": 2183 + }, + { + "epoch": 1.1099676005336383, + "grad_norm": 3.289386749267578, + "learning_rate": 9.2123716853134e-06, + "loss": 0.8827, + "step": 2184 + }, + { + "epoch": 1.1104758274569595, + "grad_norm": 3.0301473140716553, + "learning_rate": 9.211465693926644e-06, + "loss": 0.6892, + "step": 2185 + }, + { + "epoch": 1.1109840543802807, + "grad_norm": 3.2088818550109863, + "learning_rate": 9.210559226371775e-06, + "loss": 0.8858, + "step": 2186 + }, + { + "epoch": 1.1114922813036021, + "grad_norm": 3.0917153358459473, + "learning_rate": 9.20965228275128e-06, + "loss": 0.8285, + "step": 2187 + }, + { + "epoch": 1.1120005082269233, + "grad_norm": 3.0714948177337646, + "learning_rate": 9.208744863167704e-06, + "loss": 0.7709, + "step": 2188 + }, + { + "epoch": 1.1125087351502445, + "grad_norm": 3.212080955505371, + "learning_rate": 9.207836967723642e-06, + "loss": 0.8698, + "step": 2189 + }, + { + "epoch": 1.113016962073566, + "grad_norm": 2.982008695602417, + "learning_rate": 9.206928596521745e-06, + "loss": 0.8373, + "step": 2190 + }, + { + "epoch": 1.1135251889968871, + "grad_norm": 2.828354597091675, + "learning_rate": 9.206019749664721e-06, + "loss": 0.8131, + "step": 2191 + }, + { + "epoch": 1.1140334159202083, + "grad_norm": 2.826298952102661, + "learning_rate": 9.205110427255325e-06, + "loss": 0.824, + "step": 2192 + }, + { + "epoch": 1.1145416428435295, + "grad_norm": 3.315394878387451, + "learning_rate": 9.204200629396369e-06, + "loss": 0.9247, + "step": 2193 + }, + { + "epoch": 1.115049869766851, + "grad_norm": 2.9893481731414795, + "learning_rate": 9.203290356190722e-06, + "loss": 0.8431, + "step": 2194 + }, + { + "epoch": 1.1155580966901721, + "grad_norm": 3.145125150680542, + "learning_rate": 9.2023796077413e-06, + "loss": 0.8641, + "step": 2195 + }, + { + "epoch": 1.1160663236134933, + "grad_norm": 3.1989402770996094, + "learning_rate": 9.20146838415108e-06, + "loss": 0.8556, + "step": 2196 + }, + { + "epoch": 1.1165745505368148, + "grad_norm": 3.063964605331421, + "learning_rate": 9.20055668552309e-06, + "loss": 0.9002, + "step": 2197 + }, + { + "epoch": 1.117082777460136, + "grad_norm": 3.030367374420166, + "learning_rate": 9.199644511960406e-06, + "loss": 0.8305, + "step": 2198 + }, + { + "epoch": 1.1175910043834572, + "grad_norm": 3.0812602043151855, + "learning_rate": 9.198731863566167e-06, + "loss": 0.7413, + "step": 2199 + }, + { + "epoch": 1.1180992313067786, + "grad_norm": 3.024437189102173, + "learning_rate": 9.197818740443557e-06, + "loss": 0.7769, + "step": 2200 + }, + { + "epoch": 1.1186074582300998, + "grad_norm": 3.1418869495391846, + "learning_rate": 9.196905142695824e-06, + "loss": 0.8448, + "step": 2201 + }, + { + "epoch": 1.119115685153421, + "grad_norm": 3.3266446590423584, + "learning_rate": 9.19599107042626e-06, + "loss": 0.8207, + "step": 2202 + }, + { + "epoch": 1.1196239120767422, + "grad_norm": 3.2680680751800537, + "learning_rate": 9.195076523738214e-06, + "loss": 0.7964, + "step": 2203 + }, + { + "epoch": 1.1201321390000636, + "grad_norm": 3.283367872238159, + "learning_rate": 9.19416150273509e-06, + "loss": 0.8387, + "step": 2204 + }, + { + "epoch": 1.1206403659233848, + "grad_norm": 3.3058741092681885, + "learning_rate": 9.193246007520344e-06, + "loss": 0.8465, + "step": 2205 + }, + { + "epoch": 1.121148592846706, + "grad_norm": 3.558431386947632, + "learning_rate": 9.192330038197487e-06, + "loss": 0.8973, + "step": 2206 + }, + { + "epoch": 1.1216568197700274, + "grad_norm": 3.1155524253845215, + "learning_rate": 9.191413594870082e-06, + "loss": 0.8167, + "step": 2207 + }, + { + "epoch": 1.1221650466933486, + "grad_norm": 3.192988157272339, + "learning_rate": 9.190496677641745e-06, + "loss": 0.8652, + "step": 2208 + }, + { + "epoch": 1.1226732736166698, + "grad_norm": 3.0044095516204834, + "learning_rate": 9.189579286616151e-06, + "loss": 0.7597, + "step": 2209 + }, + { + "epoch": 1.123181500539991, + "grad_norm": 3.117872953414917, + "learning_rate": 9.18866142189702e-06, + "loss": 0.8327, + "step": 2210 + }, + { + "epoch": 1.1236897274633124, + "grad_norm": 3.1604981422424316, + "learning_rate": 9.187743083588135e-06, + "loss": 0.8148, + "step": 2211 + }, + { + "epoch": 1.1241979543866336, + "grad_norm": 3.1135852336883545, + "learning_rate": 9.186824271793324e-06, + "loss": 0.837, + "step": 2212 + }, + { + "epoch": 1.1247061813099548, + "grad_norm": 3.106766939163208, + "learning_rate": 9.185904986616471e-06, + "loss": 0.8302, + "step": 2213 + }, + { + "epoch": 1.1252144082332762, + "grad_norm": 3.023362874984741, + "learning_rate": 9.184985228161518e-06, + "loss": 0.89, + "step": 2214 + }, + { + "epoch": 1.1257226351565974, + "grad_norm": 3.0963006019592285, + "learning_rate": 9.184064996532457e-06, + "loss": 0.8387, + "step": 2215 + }, + { + "epoch": 1.1262308620799186, + "grad_norm": 3.141411542892456, + "learning_rate": 9.183144291833332e-06, + "loss": 0.8162, + "step": 2216 + }, + { + "epoch": 1.12673908900324, + "grad_norm": 3.1030666828155518, + "learning_rate": 9.182223114168243e-06, + "loss": 0.8868, + "step": 2217 + }, + { + "epoch": 1.1272473159265612, + "grad_norm": 3.0338220596313477, + "learning_rate": 9.181301463641343e-06, + "loss": 0.8492, + "step": 2218 + }, + { + "epoch": 1.1277555428498824, + "grad_norm": 3.1174585819244385, + "learning_rate": 9.180379340356837e-06, + "loss": 0.892, + "step": 2219 + }, + { + "epoch": 1.1282637697732036, + "grad_norm": 3.2138559818267822, + "learning_rate": 9.179456744418987e-06, + "loss": 0.849, + "step": 2220 + }, + { + "epoch": 1.128771996696525, + "grad_norm": 2.9782936573028564, + "learning_rate": 9.178533675932103e-06, + "loss": 0.7515, + "step": 2221 + }, + { + "epoch": 1.1292802236198463, + "grad_norm": 3.7740142345428467, + "learning_rate": 9.177610135000552e-06, + "loss": 0.7538, + "step": 2222 + }, + { + "epoch": 1.1297884505431675, + "grad_norm": 3.475064516067505, + "learning_rate": 9.176686121728755e-06, + "loss": 0.884, + "step": 2223 + }, + { + "epoch": 1.1302966774664889, + "grad_norm": 3.4748387336730957, + "learning_rate": 9.175761636221186e-06, + "loss": 0.8535, + "step": 2224 + }, + { + "epoch": 1.13080490438981, + "grad_norm": 3.3585240840911865, + "learning_rate": 9.17483667858237e-06, + "loss": 0.8299, + "step": 2225 + }, + { + "epoch": 1.1313131313131313, + "grad_norm": 2.91369891166687, + "learning_rate": 9.173911248916888e-06, + "loss": 0.7635, + "step": 2226 + }, + { + "epoch": 1.1318213582364525, + "grad_norm": 3.1783607006073, + "learning_rate": 9.172985347329374e-06, + "loss": 0.8534, + "step": 2227 + }, + { + "epoch": 1.1323295851597739, + "grad_norm": 3.3611485958099365, + "learning_rate": 9.172058973924514e-06, + "loss": 0.9793, + "step": 2228 + }, + { + "epoch": 1.132837812083095, + "grad_norm": 3.0700531005859375, + "learning_rate": 9.171132128807047e-06, + "loss": 0.8908, + "step": 2229 + }, + { + "epoch": 1.1333460390064163, + "grad_norm": 3.0375781059265137, + "learning_rate": 9.170204812081767e-06, + "loss": 0.8368, + "step": 2230 + }, + { + "epoch": 1.1338542659297377, + "grad_norm": 2.99582576751709, + "learning_rate": 9.169277023853523e-06, + "loss": 0.7991, + "step": 2231 + }, + { + "epoch": 1.134362492853059, + "grad_norm": 3.3543779850006104, + "learning_rate": 9.168348764227213e-06, + "loss": 0.9089, + "step": 2232 + }, + { + "epoch": 1.13487071977638, + "grad_norm": 2.9977941513061523, + "learning_rate": 9.16742003330779e-06, + "loss": 0.8454, + "step": 2233 + }, + { + "epoch": 1.1353789466997015, + "grad_norm": 2.8905301094055176, + "learning_rate": 9.166490831200264e-06, + "loss": 0.7581, + "step": 2234 + }, + { + "epoch": 1.1358871736230227, + "grad_norm": 3.1561331748962402, + "learning_rate": 9.165561158009689e-06, + "loss": 0.8404, + "step": 2235 + }, + { + "epoch": 1.136395400546344, + "grad_norm": 3.356651544570923, + "learning_rate": 9.164631013841184e-06, + "loss": 0.929, + "step": 2236 + }, + { + "epoch": 1.1369036274696651, + "grad_norm": 2.907170534133911, + "learning_rate": 9.163700398799913e-06, + "loss": 0.8456, + "step": 2237 + }, + { + "epoch": 1.1374118543929865, + "grad_norm": 3.214137077331543, + "learning_rate": 9.162769312991095e-06, + "loss": 0.7972, + "step": 2238 + }, + { + "epoch": 1.1379200813163077, + "grad_norm": 2.9030961990356445, + "learning_rate": 9.161837756520005e-06, + "loss": 0.8041, + "step": 2239 + }, + { + "epoch": 1.138428308239629, + "grad_norm": 3.315462112426758, + "learning_rate": 9.160905729491967e-06, + "loss": 0.8011, + "step": 2240 + }, + { + "epoch": 1.1389365351629503, + "grad_norm": 3.185739278793335, + "learning_rate": 9.159973232012363e-06, + "loss": 0.8687, + "step": 2241 + }, + { + "epoch": 1.1394447620862715, + "grad_norm": 3.2211828231811523, + "learning_rate": 9.159040264186621e-06, + "loss": 0.8402, + "step": 2242 + }, + { + "epoch": 1.1399529890095927, + "grad_norm": 3.1946299076080322, + "learning_rate": 9.158106826120232e-06, + "loss": 0.8323, + "step": 2243 + }, + { + "epoch": 1.140461215932914, + "grad_norm": 2.910707712173462, + "learning_rate": 9.157172917918732e-06, + "loss": 0.8432, + "step": 2244 + }, + { + "epoch": 1.1409694428562354, + "grad_norm": 3.3521809577941895, + "learning_rate": 9.156238539687713e-06, + "loss": 0.8958, + "step": 2245 + }, + { + "epoch": 1.1414776697795566, + "grad_norm": 2.8933801651000977, + "learning_rate": 9.155303691532821e-06, + "loss": 0.777, + "step": 2246 + }, + { + "epoch": 1.1419858967028778, + "grad_norm": 3.164515256881714, + "learning_rate": 9.154368373559754e-06, + "loss": 0.8503, + "step": 2247 + }, + { + "epoch": 1.1424941236261992, + "grad_norm": 2.9174115657806396, + "learning_rate": 9.153432585874265e-06, + "loss": 0.7781, + "step": 2248 + }, + { + "epoch": 1.1430023505495204, + "grad_norm": 3.1479575634002686, + "learning_rate": 9.152496328582156e-06, + "loss": 0.9578, + "step": 2249 + }, + { + "epoch": 1.1435105774728416, + "grad_norm": 3.2180874347686768, + "learning_rate": 9.151559601789286e-06, + "loss": 0.7281, + "step": 2250 + }, + { + "epoch": 1.144018804396163, + "grad_norm": 2.899796724319458, + "learning_rate": 9.150622405601564e-06, + "loss": 0.7567, + "step": 2251 + }, + { + "epoch": 1.1445270313194842, + "grad_norm": 3.3812904357910156, + "learning_rate": 9.149684740124958e-06, + "loss": 0.8009, + "step": 2252 + }, + { + "epoch": 1.1450352582428054, + "grad_norm": 3.2274460792541504, + "learning_rate": 9.14874660546548e-06, + "loss": 0.9155, + "step": 2253 + }, + { + "epoch": 1.1455434851661266, + "grad_norm": 3.4081389904022217, + "learning_rate": 9.147808001729203e-06, + "loss": 0.8662, + "step": 2254 + }, + { + "epoch": 1.146051712089448, + "grad_norm": 3.192394256591797, + "learning_rate": 9.14686892902225e-06, + "loss": 0.872, + "step": 2255 + }, + { + "epoch": 1.1465599390127692, + "grad_norm": 3.7580795288085938, + "learning_rate": 9.145929387450794e-06, + "loss": 0.9428, + "step": 2256 + }, + { + "epoch": 1.1470681659360904, + "grad_norm": 2.902574300765991, + "learning_rate": 9.144989377121067e-06, + "loss": 0.7778, + "step": 2257 + }, + { + "epoch": 1.1475763928594118, + "grad_norm": 3.1599409580230713, + "learning_rate": 9.14404889813935e-06, + "loss": 0.909, + "step": 2258 + }, + { + "epoch": 1.148084619782733, + "grad_norm": 3.0382742881774902, + "learning_rate": 9.143107950611978e-06, + "loss": 0.788, + "step": 2259 + }, + { + "epoch": 1.1485928467060542, + "grad_norm": 3.310295343399048, + "learning_rate": 9.14216653464534e-06, + "loss": 0.8701, + "step": 2260 + }, + { + "epoch": 1.1491010736293754, + "grad_norm": 3.244692325592041, + "learning_rate": 9.141224650345875e-06, + "loss": 0.8442, + "step": 2261 + }, + { + "epoch": 1.1496093005526968, + "grad_norm": 3.261472463607788, + "learning_rate": 9.140282297820078e-06, + "loss": 0.8507, + "step": 2262 + }, + { + "epoch": 1.150117527476018, + "grad_norm": 3.2070884704589844, + "learning_rate": 9.139339477174495e-06, + "loss": 0.8635, + "step": 2263 + }, + { + "epoch": 1.1506257543993392, + "grad_norm": 3.273611307144165, + "learning_rate": 9.138396188515725e-06, + "loss": 0.8498, + "step": 2264 + }, + { + "epoch": 1.1511339813226606, + "grad_norm": 3.6329290866851807, + "learning_rate": 9.137452431950424e-06, + "loss": 0.9368, + "step": 2265 + }, + { + "epoch": 1.1516422082459818, + "grad_norm": 3.0486176013946533, + "learning_rate": 9.136508207585295e-06, + "loss": 0.8328, + "step": 2266 + }, + { + "epoch": 1.152150435169303, + "grad_norm": 3.372185468673706, + "learning_rate": 9.135563515527098e-06, + "loss": 0.8505, + "step": 2267 + }, + { + "epoch": 1.1526586620926245, + "grad_norm": 3.2860240936279297, + "learning_rate": 9.134618355882641e-06, + "loss": 0.867, + "step": 2268 + }, + { + "epoch": 1.1531668890159457, + "grad_norm": 3.219965934753418, + "learning_rate": 9.133672728758791e-06, + "loss": 0.8907, + "step": 2269 + }, + { + "epoch": 1.1536751159392669, + "grad_norm": 3.027545928955078, + "learning_rate": 9.132726634262465e-06, + "loss": 0.856, + "step": 2270 + }, + { + "epoch": 1.154183342862588, + "grad_norm": 3.089707851409912, + "learning_rate": 9.131780072500633e-06, + "loss": 0.9343, + "step": 2271 + }, + { + "epoch": 1.1546915697859095, + "grad_norm": 3.1712076663970947, + "learning_rate": 9.130833043580315e-06, + "loss": 0.8669, + "step": 2272 + }, + { + "epoch": 1.1551997967092307, + "grad_norm": 2.896791458129883, + "learning_rate": 9.12988554760859e-06, + "loss": 0.7617, + "step": 2273 + }, + { + "epoch": 1.1557080236325519, + "grad_norm": 3.4459807872772217, + "learning_rate": 9.128937584692586e-06, + "loss": 0.8495, + "step": 2274 + }, + { + "epoch": 1.1562162505558733, + "grad_norm": 2.8953559398651123, + "learning_rate": 9.127989154939481e-06, + "loss": 0.834, + "step": 2275 + }, + { + "epoch": 1.1567244774791945, + "grad_norm": 3.0459115505218506, + "learning_rate": 9.127040258456512e-06, + "loss": 0.8592, + "step": 2276 + }, + { + "epoch": 1.1572327044025157, + "grad_norm": 2.9910728931427, + "learning_rate": 9.126090895350966e-06, + "loss": 0.8281, + "step": 2277 + }, + { + "epoch": 1.1577409313258369, + "grad_norm": 3.0232229232788086, + "learning_rate": 9.125141065730179e-06, + "loss": 0.868, + "step": 2278 + }, + { + "epoch": 1.1582491582491583, + "grad_norm": 4.885484218597412, + "learning_rate": 9.124190769701547e-06, + "loss": 0.8484, + "step": 2279 + }, + { + "epoch": 1.1587573851724795, + "grad_norm": 3.1473946571350098, + "learning_rate": 9.123240007372514e-06, + "loss": 0.9519, + "step": 2280 + }, + { + "epoch": 1.1592656120958007, + "grad_norm": 3.1233749389648438, + "learning_rate": 9.122288778850576e-06, + "loss": 0.748, + "step": 2281 + }, + { + "epoch": 1.1597738390191221, + "grad_norm": 3.5578534603118896, + "learning_rate": 9.121337084243284e-06, + "loss": 0.8351, + "step": 2282 + }, + { + "epoch": 1.1602820659424433, + "grad_norm": 3.0705373287200928, + "learning_rate": 9.120384923658242e-06, + "loss": 0.8245, + "step": 2283 + }, + { + "epoch": 1.1607902928657645, + "grad_norm": 3.356689453125, + "learning_rate": 9.119432297203104e-06, + "loss": 0.972, + "step": 2284 + }, + { + "epoch": 1.161298519789086, + "grad_norm": 3.0365214347839355, + "learning_rate": 9.118479204985582e-06, + "loss": 0.924, + "step": 2285 + }, + { + "epoch": 1.1618067467124071, + "grad_norm": 9.004101753234863, + "learning_rate": 9.117525647113433e-06, + "loss": 0.7769, + "step": 2286 + }, + { + "epoch": 1.1623149736357283, + "grad_norm": 3.2367615699768066, + "learning_rate": 9.116571623694473e-06, + "loss": 0.7716, + "step": 2287 + }, + { + "epoch": 1.1628232005590495, + "grad_norm": 3.1672203540802, + "learning_rate": 9.115617134836567e-06, + "loss": 0.7419, + "step": 2288 + }, + { + "epoch": 1.163331427482371, + "grad_norm": 3.166799545288086, + "learning_rate": 9.114662180647635e-06, + "loss": 0.7803, + "step": 2289 + }, + { + "epoch": 1.1638396544056921, + "grad_norm": 7.431529521942139, + "learning_rate": 9.11370676123565e-06, + "loss": 0.8892, + "step": 2290 + }, + { + "epoch": 1.1643478813290133, + "grad_norm": 3.1311194896698, + "learning_rate": 9.112750876708633e-06, + "loss": 0.8267, + "step": 2291 + }, + { + "epoch": 1.1648561082523345, + "grad_norm": 3.522717237472534, + "learning_rate": 9.111794527174665e-06, + "loss": 0.9574, + "step": 2292 + }, + { + "epoch": 1.165364335175656, + "grad_norm": 3.246248245239258, + "learning_rate": 9.110837712741871e-06, + "loss": 0.8789, + "step": 2293 + }, + { + "epoch": 1.1658725620989772, + "grad_norm": 3.2041945457458496, + "learning_rate": 9.109880433518434e-06, + "loss": 0.8074, + "step": 2294 + }, + { + "epoch": 1.1663807890222984, + "grad_norm": 3.2885286808013916, + "learning_rate": 9.10892268961259e-06, + "loss": 0.9215, + "step": 2295 + }, + { + "epoch": 1.1668890159456198, + "grad_norm": 2.9827210903167725, + "learning_rate": 9.107964481132625e-06, + "loss": 0.8479, + "step": 2296 + }, + { + "epoch": 1.167397242868941, + "grad_norm": 3.529890298843384, + "learning_rate": 9.10700580818688e-06, + "loss": 0.9017, + "step": 2297 + }, + { + "epoch": 1.1679054697922622, + "grad_norm": 3.1567318439483643, + "learning_rate": 9.106046670883745e-06, + "loss": 0.8741, + "step": 2298 + }, + { + "epoch": 1.1684136967155836, + "grad_norm": 2.972628116607666, + "learning_rate": 9.105087069331666e-06, + "loss": 0.7806, + "step": 2299 + }, + { + "epoch": 1.1689219236389048, + "grad_norm": 3.2747461795806885, + "learning_rate": 9.104127003639138e-06, + "loss": 0.8251, + "step": 2300 + }, + { + "epoch": 1.169430150562226, + "grad_norm": 3.3266758918762207, + "learning_rate": 9.103166473914714e-06, + "loss": 0.8261, + "step": 2301 + }, + { + "epoch": 1.1699383774855474, + "grad_norm": 3.083332061767578, + "learning_rate": 9.102205480266993e-06, + "loss": 0.8373, + "step": 2302 + }, + { + "epoch": 1.1704466044088686, + "grad_norm": 2.9103949069976807, + "learning_rate": 9.101244022804631e-06, + "loss": 0.7487, + "step": 2303 + }, + { + "epoch": 1.1709548313321898, + "grad_norm": 3.736177444458008, + "learning_rate": 9.100282101636334e-06, + "loss": 0.868, + "step": 2304 + }, + { + "epoch": 1.171463058255511, + "grad_norm": 2.9956789016723633, + "learning_rate": 9.099319716870863e-06, + "loss": 0.7916, + "step": 2305 + }, + { + "epoch": 1.1719712851788324, + "grad_norm": 2.974546194076538, + "learning_rate": 9.098356868617028e-06, + "loss": 0.8415, + "step": 2306 + }, + { + "epoch": 1.1724795121021536, + "grad_norm": 4.560594081878662, + "learning_rate": 9.097393556983694e-06, + "loss": 0.9999, + "step": 2307 + }, + { + "epoch": 1.1729877390254748, + "grad_norm": 3.2358243465423584, + "learning_rate": 9.096429782079777e-06, + "loss": 0.7266, + "step": 2308 + }, + { + "epoch": 1.173495965948796, + "grad_norm": 3.0477235317230225, + "learning_rate": 9.095465544014244e-06, + "loss": 0.9312, + "step": 2309 + }, + { + "epoch": 1.1740041928721174, + "grad_norm": 3.2846338748931885, + "learning_rate": 9.09450084289612e-06, + "loss": 0.8764, + "step": 2310 + }, + { + "epoch": 1.1745124197954386, + "grad_norm": 3.05623459815979, + "learning_rate": 9.093535678834479e-06, + "loss": 0.7985, + "step": 2311 + }, + { + "epoch": 1.1750206467187598, + "grad_norm": 2.8562092781066895, + "learning_rate": 9.092570051938444e-06, + "loss": 0.8054, + "step": 2312 + }, + { + "epoch": 1.1755288736420813, + "grad_norm": 3.0281214714050293, + "learning_rate": 9.091603962317192e-06, + "loss": 0.858, + "step": 2313 + }, + { + "epoch": 1.1760371005654024, + "grad_norm": 3.491211414337158, + "learning_rate": 9.090637410079958e-06, + "loss": 0.8533, + "step": 2314 + }, + { + "epoch": 1.1765453274887236, + "grad_norm": 3.175933599472046, + "learning_rate": 9.089670395336023e-06, + "loss": 0.7493, + "step": 2315 + }, + { + "epoch": 1.177053554412045, + "grad_norm": 3.269052505493164, + "learning_rate": 9.088702918194723e-06, + "loss": 0.7981, + "step": 2316 + }, + { + "epoch": 1.1775617813353663, + "grad_norm": 3.173762559890747, + "learning_rate": 9.087734978765443e-06, + "loss": 0.7655, + "step": 2317 + }, + { + "epoch": 1.1780700082586875, + "grad_norm": 3.22505521774292, + "learning_rate": 9.086766577157626e-06, + "loss": 0.8203, + "step": 2318 + }, + { + "epoch": 1.1785782351820089, + "grad_norm": 3.346877336502075, + "learning_rate": 9.085797713480763e-06, + "loss": 0.8404, + "step": 2319 + }, + { + "epoch": 1.17908646210533, + "grad_norm": 3.098677396774292, + "learning_rate": 9.084828387844396e-06, + "loss": 0.8589, + "step": 2320 + }, + { + "epoch": 1.1795946890286513, + "grad_norm": 3.0070483684539795, + "learning_rate": 9.083858600358125e-06, + "loss": 0.8285, + "step": 2321 + }, + { + "epoch": 1.1801029159519725, + "grad_norm": 3.2013142108917236, + "learning_rate": 9.082888351131596e-06, + "loss": 0.7647, + "step": 2322 + }, + { + "epoch": 1.180611142875294, + "grad_norm": 2.795560359954834, + "learning_rate": 9.08191764027451e-06, + "loss": 0.7127, + "step": 2323 + }, + { + "epoch": 1.181119369798615, + "grad_norm": 2.8931901454925537, + "learning_rate": 9.080946467896623e-06, + "loss": 0.7877, + "step": 2324 + }, + { + "epoch": 1.1816275967219363, + "grad_norm": 3.125441551208496, + "learning_rate": 9.07997483410774e-06, + "loss": 0.7735, + "step": 2325 + }, + { + "epoch": 1.1821358236452575, + "grad_norm": 3.3146045207977295, + "learning_rate": 9.079002739017713e-06, + "loss": 0.8159, + "step": 2326 + }, + { + "epoch": 1.182644050568579, + "grad_norm": 3.1576666831970215, + "learning_rate": 9.078030182736458e-06, + "loss": 0.8076, + "step": 2327 + }, + { + "epoch": 1.1831522774919, + "grad_norm": 3.0713062286376953, + "learning_rate": 9.077057165373932e-06, + "loss": 0.7745, + "step": 2328 + }, + { + "epoch": 1.1836605044152213, + "grad_norm": 3.206789255142212, + "learning_rate": 9.076083687040154e-06, + "loss": 0.7932, + "step": 2329 + }, + { + "epoch": 1.1841687313385427, + "grad_norm": 3.2853028774261475, + "learning_rate": 9.075109747845188e-06, + "loss": 0.8669, + "step": 2330 + }, + { + "epoch": 1.184676958261864, + "grad_norm": 3.4324445724487305, + "learning_rate": 9.07413534789915e-06, + "loss": 0.833, + "step": 2331 + }, + { + "epoch": 1.1851851851851851, + "grad_norm": 3.3072774410247803, + "learning_rate": 9.073160487312212e-06, + "loss": 0.9215, + "step": 2332 + }, + { + "epoch": 1.1856934121085065, + "grad_norm": 3.1970913410186768, + "learning_rate": 9.072185166194595e-06, + "loss": 0.8354, + "step": 2333 + }, + { + "epoch": 1.1862016390318277, + "grad_norm": 3.035238027572632, + "learning_rate": 9.071209384656576e-06, + "loss": 0.8417, + "step": 2334 + }, + { + "epoch": 1.186709865955149, + "grad_norm": 3.103426694869995, + "learning_rate": 9.070233142808478e-06, + "loss": 0.7325, + "step": 2335 + }, + { + "epoch": 1.1872180928784704, + "grad_norm": 2.9447386264801025, + "learning_rate": 9.069256440760683e-06, + "loss": 0.9334, + "step": 2336 + }, + { + "epoch": 1.1877263198017916, + "grad_norm": 3.1110117435455322, + "learning_rate": 9.06827927862362e-06, + "loss": 0.8528, + "step": 2337 + }, + { + "epoch": 1.1882345467251128, + "grad_norm": 2.9499640464782715, + "learning_rate": 9.06730165650777e-06, + "loss": 0.8304, + "step": 2338 + }, + { + "epoch": 1.188742773648434, + "grad_norm": 3.1496939659118652, + "learning_rate": 9.06632357452367e-06, + "loss": 0.9043, + "step": 2339 + }, + { + "epoch": 1.1892510005717554, + "grad_norm": 3.112644672393799, + "learning_rate": 9.065345032781906e-06, + "loss": 0.754, + "step": 2340 + }, + { + "epoch": 1.1897592274950766, + "grad_norm": 3.083366632461548, + "learning_rate": 9.064366031393114e-06, + "loss": 0.788, + "step": 2341 + }, + { + "epoch": 1.1902674544183978, + "grad_norm": 3.426002025604248, + "learning_rate": 9.06338657046799e-06, + "loss": 0.8761, + "step": 2342 + }, + { + "epoch": 1.190775681341719, + "grad_norm": 3.2204999923706055, + "learning_rate": 9.06240665011727e-06, + "loss": 0.9046, + "step": 2343 + }, + { + "epoch": 1.1912839082650404, + "grad_norm": 3.1768798828125, + "learning_rate": 9.061426270451752e-06, + "loss": 0.8119, + "step": 2344 + }, + { + "epoch": 1.1917921351883616, + "grad_norm": 3.0866265296936035, + "learning_rate": 9.060445431582282e-06, + "loss": 0.7913, + "step": 2345 + }, + { + "epoch": 1.1923003621116828, + "grad_norm": 3.2694597244262695, + "learning_rate": 9.05946413361976e-06, + "loss": 0.871, + "step": 2346 + }, + { + "epoch": 1.1928085890350042, + "grad_norm": 3.0038201808929443, + "learning_rate": 9.058482376675132e-06, + "loss": 0.8324, + "step": 2347 + }, + { + "epoch": 1.1933168159583254, + "grad_norm": 3.2698614597320557, + "learning_rate": 9.057500160859405e-06, + "loss": 0.9151, + "step": 2348 + }, + { + "epoch": 1.1938250428816466, + "grad_norm": 3.040255546569824, + "learning_rate": 9.056517486283626e-06, + "loss": 0.7836, + "step": 2349 + }, + { + "epoch": 1.194333269804968, + "grad_norm": 3.324594736099243, + "learning_rate": 9.055534353058907e-06, + "loss": 0.8665, + "step": 2350 + }, + { + "epoch": 1.1948414967282892, + "grad_norm": 2.856466293334961, + "learning_rate": 9.054550761296404e-06, + "loss": 0.761, + "step": 2351 + }, + { + "epoch": 1.1953497236516104, + "grad_norm": 3.116780996322632, + "learning_rate": 9.053566711107327e-06, + "loss": 0.8185, + "step": 2352 + }, + { + "epoch": 1.1958579505749318, + "grad_norm": 3.159078359603882, + "learning_rate": 9.052582202602935e-06, + "loss": 0.8617, + "step": 2353 + }, + { + "epoch": 1.196366177498253, + "grad_norm": 3.0667052268981934, + "learning_rate": 9.051597235894544e-06, + "loss": 0.8621, + "step": 2354 + }, + { + "epoch": 1.1968744044215742, + "grad_norm": 3.0409858226776123, + "learning_rate": 9.050611811093517e-06, + "loss": 0.8067, + "step": 2355 + }, + { + "epoch": 1.1973826313448954, + "grad_norm": 3.0293259620666504, + "learning_rate": 9.049625928311272e-06, + "loss": 0.7851, + "step": 2356 + }, + { + "epoch": 1.1978908582682168, + "grad_norm": 3.1196017265319824, + "learning_rate": 9.048639587659275e-06, + "loss": 0.88, + "step": 2357 + }, + { + "epoch": 1.198399085191538, + "grad_norm": 3.0092966556549072, + "learning_rate": 9.04765278924905e-06, + "loss": 0.8317, + "step": 2358 + }, + { + "epoch": 1.1989073121148592, + "grad_norm": 3.1430952548980713, + "learning_rate": 9.046665533192167e-06, + "loss": 0.7821, + "step": 2359 + }, + { + "epoch": 1.1994155390381804, + "grad_norm": 3.2352135181427, + "learning_rate": 9.04567781960025e-06, + "loss": 0.8412, + "step": 2360 + }, + { + "epoch": 1.1999237659615019, + "grad_norm": 3.115145206451416, + "learning_rate": 9.044689648584974e-06, + "loss": 0.784, + "step": 2361 + }, + { + "epoch": 1.200431992884823, + "grad_norm": 2.9744040966033936, + "learning_rate": 9.043701020258067e-06, + "loss": 0.8497, + "step": 2362 + }, + { + "epoch": 1.2009402198081442, + "grad_norm": 3.1320130825042725, + "learning_rate": 9.042711934731309e-06, + "loss": 0.8199, + "step": 2363 + }, + { + "epoch": 1.2014484467314657, + "grad_norm": 2.8396992683410645, + "learning_rate": 9.041722392116529e-06, + "loss": 0.7548, + "step": 2364 + }, + { + "epoch": 1.2019566736547869, + "grad_norm": 3.1364777088165283, + "learning_rate": 9.04073239252561e-06, + "loss": 0.7983, + "step": 2365 + }, + { + "epoch": 1.202464900578108, + "grad_norm": 3.106210708618164, + "learning_rate": 9.039741936070487e-06, + "loss": 0.8722, + "step": 2366 + }, + { + "epoch": 1.2029731275014295, + "grad_norm": 2.981907367706299, + "learning_rate": 9.038751022863144e-06, + "loss": 0.821, + "step": 2367 + }, + { + "epoch": 1.2034813544247507, + "grad_norm": 3.6937308311462402, + "learning_rate": 9.037759653015619e-06, + "loss": 1.0072, + "step": 2368 + }, + { + "epoch": 1.2039895813480719, + "grad_norm": 2.9094719886779785, + "learning_rate": 9.03676782664e-06, + "loss": 0.8495, + "step": 2369 + }, + { + "epoch": 1.2044978082713933, + "grad_norm": 3.194845676422119, + "learning_rate": 9.035775543848428e-06, + "loss": 0.8678, + "step": 2370 + }, + { + "epoch": 1.2050060351947145, + "grad_norm": 3.114051580429077, + "learning_rate": 9.034782804753097e-06, + "loss": 0.8427, + "step": 2371 + }, + { + "epoch": 1.2055142621180357, + "grad_norm": 3.27559757232666, + "learning_rate": 9.033789609466248e-06, + "loss": 0.8815, + "step": 2372 + }, + { + "epoch": 1.206022489041357, + "grad_norm": 2.918750286102295, + "learning_rate": 9.032795958100179e-06, + "loss": 0.7836, + "step": 2373 + }, + { + "epoch": 1.2065307159646783, + "grad_norm": 3.2416598796844482, + "learning_rate": 9.031801850767234e-06, + "loss": 0.811, + "step": 2374 + }, + { + "epoch": 1.2070389428879995, + "grad_norm": 3.0644783973693848, + "learning_rate": 9.030807287579814e-06, + "loss": 0.836, + "step": 2375 + }, + { + "epoch": 1.2075471698113207, + "grad_norm": 3.6824216842651367, + "learning_rate": 9.02981226865037e-06, + "loss": 0.9161, + "step": 2376 + }, + { + "epoch": 1.208055396734642, + "grad_norm": 3.1358485221862793, + "learning_rate": 9.028816794091397e-06, + "loss": 0.8101, + "step": 2377 + }, + { + "epoch": 1.2085636236579633, + "grad_norm": 3.423971652984619, + "learning_rate": 9.027820864015455e-06, + "loss": 0.8777, + "step": 2378 + }, + { + "epoch": 1.2090718505812845, + "grad_norm": 3.24017333984375, + "learning_rate": 9.026824478535145e-06, + "loss": 0.8798, + "step": 2379 + }, + { + "epoch": 1.2095800775046057, + "grad_norm": 3.0433313846588135, + "learning_rate": 9.025827637763125e-06, + "loss": 0.8052, + "step": 2380 + }, + { + "epoch": 1.2100883044279271, + "grad_norm": 3.0827200412750244, + "learning_rate": 9.024830341812103e-06, + "loss": 0.8905, + "step": 2381 + }, + { + "epoch": 1.2105965313512483, + "grad_norm": 3.2809956073760986, + "learning_rate": 9.023832590794834e-06, + "loss": 0.8415, + "step": 2382 + }, + { + "epoch": 1.2111047582745695, + "grad_norm": 3.0780837535858154, + "learning_rate": 9.022834384824133e-06, + "loss": 0.853, + "step": 2383 + }, + { + "epoch": 1.211612985197891, + "grad_norm": 3.268043041229248, + "learning_rate": 9.021835724012858e-06, + "loss": 0.8751, + "step": 2384 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 3.1368472576141357, + "learning_rate": 9.020836608473926e-06, + "loss": 0.8292, + "step": 2385 + }, + { + "epoch": 1.2126294390445334, + "grad_norm": 2.958005905151367, + "learning_rate": 9.019837038320298e-06, + "loss": 0.8687, + "step": 2386 + }, + { + "epoch": 1.2131376659678546, + "grad_norm": 3.2961275577545166, + "learning_rate": 9.018837013664993e-06, + "loss": 0.7909, + "step": 2387 + }, + { + "epoch": 1.213645892891176, + "grad_norm": 3.2279088497161865, + "learning_rate": 9.017836534621078e-06, + "loss": 0.791, + "step": 2388 + }, + { + "epoch": 1.2141541198144972, + "grad_norm": 3.077115535736084, + "learning_rate": 9.01683560130167e-06, + "loss": 0.741, + "step": 2389 + }, + { + "epoch": 1.2146623467378184, + "grad_norm": 3.03611159324646, + "learning_rate": 9.015834213819941e-06, + "loss": 0.9399, + "step": 2390 + }, + { + "epoch": 1.2151705736611398, + "grad_norm": 3.0374908447265625, + "learning_rate": 9.014832372289113e-06, + "loss": 0.7597, + "step": 2391 + }, + { + "epoch": 1.215678800584461, + "grad_norm": 3.051901340484619, + "learning_rate": 9.013830076822457e-06, + "loss": 0.8795, + "step": 2392 + }, + { + "epoch": 1.2161870275077822, + "grad_norm": 3.266887664794922, + "learning_rate": 9.012827327533297e-06, + "loss": 0.92, + "step": 2393 + }, + { + "epoch": 1.2166952544311034, + "grad_norm": 3.2254724502563477, + "learning_rate": 9.011824124535012e-06, + "loss": 0.791, + "step": 2394 + }, + { + "epoch": 1.2172034813544248, + "grad_norm": 2.9573566913604736, + "learning_rate": 9.010820467941026e-06, + "loss": 0.8311, + "step": 2395 + }, + { + "epoch": 1.217711708277746, + "grad_norm": 3.0001730918884277, + "learning_rate": 9.009816357864819e-06, + "loss": 0.8513, + "step": 2396 + }, + { + "epoch": 1.2182199352010672, + "grad_norm": 3.096930503845215, + "learning_rate": 9.008811794419917e-06, + "loss": 0.8505, + "step": 2397 + }, + { + "epoch": 1.2187281621243886, + "grad_norm": 3.2050721645355225, + "learning_rate": 9.007806777719904e-06, + "loss": 0.805, + "step": 2398 + }, + { + "epoch": 1.2192363890477098, + "grad_norm": 3.0328614711761475, + "learning_rate": 9.00680130787841e-06, + "loss": 0.8385, + "step": 2399 + }, + { + "epoch": 1.219744615971031, + "grad_norm": 3.149296283721924, + "learning_rate": 9.00579538500912e-06, + "loss": 0.7824, + "step": 2400 + }, + { + "epoch": 1.2202528428943524, + "grad_norm": 3.3050854206085205, + "learning_rate": 9.004789009225766e-06, + "loss": 0.8228, + "step": 2401 + }, + { + "epoch": 1.2207610698176736, + "grad_norm": 3.378373384475708, + "learning_rate": 9.003782180642137e-06, + "loss": 0.839, + "step": 2402 + }, + { + "epoch": 1.2212692967409948, + "grad_norm": 3.153099298477173, + "learning_rate": 9.002774899372065e-06, + "loss": 0.7567, + "step": 2403 + }, + { + "epoch": 1.221777523664316, + "grad_norm": 2.9937663078308105, + "learning_rate": 9.001767165529442e-06, + "loss": 0.8638, + "step": 2404 + }, + { + "epoch": 1.2222857505876374, + "grad_norm": 3.197364568710327, + "learning_rate": 9.000758979228206e-06, + "loss": 0.8708, + "step": 2405 + }, + { + "epoch": 1.2227939775109586, + "grad_norm": 3.2125697135925293, + "learning_rate": 8.999750340582347e-06, + "loss": 0.8009, + "step": 2406 + }, + { + "epoch": 1.2233022044342798, + "grad_norm": 3.165888786315918, + "learning_rate": 8.998741249705905e-06, + "loss": 0.8278, + "step": 2407 + }, + { + "epoch": 1.2238104313576013, + "grad_norm": 3.570157766342163, + "learning_rate": 8.997731706712976e-06, + "loss": 0.7706, + "step": 2408 + }, + { + "epoch": 1.2243186582809225, + "grad_norm": 3.1924993991851807, + "learning_rate": 8.9967217117177e-06, + "loss": 0.7931, + "step": 2409 + }, + { + "epoch": 1.2248268852042437, + "grad_norm": 3.422243356704712, + "learning_rate": 8.995711264834274e-06, + "loss": 0.8448, + "step": 2410 + }, + { + "epoch": 1.2253351121275649, + "grad_norm": 3.4591121673583984, + "learning_rate": 8.994700366176945e-06, + "loss": 0.9026, + "step": 2411 + }, + { + "epoch": 1.2258433390508863, + "grad_norm": 3.981348752975464, + "learning_rate": 8.993689015860006e-06, + "loss": 0.8046, + "step": 2412 + }, + { + "epoch": 1.2263515659742075, + "grad_norm": 3.094794273376465, + "learning_rate": 8.992677213997809e-06, + "loss": 0.8496, + "step": 2413 + }, + { + "epoch": 1.2268597928975287, + "grad_norm": 3.073066234588623, + "learning_rate": 8.991664960704749e-06, + "loss": 0.8681, + "step": 2414 + }, + { + "epoch": 1.22736801982085, + "grad_norm": 3.075650930404663, + "learning_rate": 8.99065225609528e-06, + "loss": 0.8837, + "step": 2415 + }, + { + "epoch": 1.2278762467441713, + "grad_norm": 3.204456090927124, + "learning_rate": 8.989639100283903e-06, + "loss": 0.8398, + "step": 2416 + }, + { + "epoch": 1.2283844736674925, + "grad_norm": 2.9511778354644775, + "learning_rate": 8.988625493385166e-06, + "loss": 0.7308, + "step": 2417 + }, + { + "epoch": 1.228892700590814, + "grad_norm": 3.3825843334198, + "learning_rate": 8.987611435513677e-06, + "loss": 0.8433, + "step": 2418 + }, + { + "epoch": 1.229400927514135, + "grad_norm": 3.2403564453125, + "learning_rate": 8.986596926784088e-06, + "loss": 0.8387, + "step": 2419 + }, + { + "epoch": 1.2299091544374563, + "grad_norm": 3.0978314876556396, + "learning_rate": 8.985581967311103e-06, + "loss": 0.9133, + "step": 2420 + }, + { + "epoch": 1.2304173813607775, + "grad_norm": 3.2604808807373047, + "learning_rate": 8.984566557209481e-06, + "loss": 0.8242, + "step": 2421 + }, + { + "epoch": 1.230925608284099, + "grad_norm": 3.1556198596954346, + "learning_rate": 8.983550696594026e-06, + "loss": 0.8673, + "step": 2422 + }, + { + "epoch": 1.2314338352074201, + "grad_norm": 2.9530467987060547, + "learning_rate": 8.982534385579598e-06, + "loss": 0.8397, + "step": 2423 + }, + { + "epoch": 1.2319420621307413, + "grad_norm": 3.1731629371643066, + "learning_rate": 8.981517624281106e-06, + "loss": 0.8845, + "step": 2424 + }, + { + "epoch": 1.2324502890540627, + "grad_norm": 3.053375244140625, + "learning_rate": 8.980500412813506e-06, + "loss": 0.8773, + "step": 2425 + }, + { + "epoch": 1.232958515977384, + "grad_norm": 3.229344606399536, + "learning_rate": 8.979482751291816e-06, + "loss": 0.8718, + "step": 2426 + }, + { + "epoch": 1.2334667429007051, + "grad_norm": 3.266913414001465, + "learning_rate": 8.97846463983109e-06, + "loss": 0.9393, + "step": 2427 + }, + { + "epoch": 1.2339749698240263, + "grad_norm": 3.1056594848632812, + "learning_rate": 8.977446078546445e-06, + "loss": 0.7848, + "step": 2428 + }, + { + "epoch": 1.2344831967473477, + "grad_norm": 2.832486867904663, + "learning_rate": 8.976427067553044e-06, + "loss": 0.8953, + "step": 2429 + }, + { + "epoch": 1.234991423670669, + "grad_norm": 3.0973379611968994, + "learning_rate": 8.9754076069661e-06, + "loss": 0.8394, + "step": 2430 + }, + { + "epoch": 1.2354996505939901, + "grad_norm": 3.237048625946045, + "learning_rate": 8.97438769690088e-06, + "loss": 0.9313, + "step": 2431 + }, + { + "epoch": 1.2360078775173116, + "grad_norm": 3.141131639480591, + "learning_rate": 8.973367337472694e-06, + "loss": 0.7916, + "step": 2432 + }, + { + "epoch": 1.2365161044406328, + "grad_norm": 3.2379703521728516, + "learning_rate": 8.972346528796916e-06, + "loss": 0.8643, + "step": 2433 + }, + { + "epoch": 1.237024331363954, + "grad_norm": 3.0718865394592285, + "learning_rate": 8.97132527098896e-06, + "loss": 0.8541, + "step": 2434 + }, + { + "epoch": 1.2375325582872754, + "grad_norm": 3.0552730560302734, + "learning_rate": 8.970303564164293e-06, + "loss": 0.7842, + "step": 2435 + }, + { + "epoch": 1.2380407852105966, + "grad_norm": 3.286994457244873, + "learning_rate": 8.969281408438437e-06, + "loss": 0.8628, + "step": 2436 + }, + { + "epoch": 1.2385490121339178, + "grad_norm": 3.58115291595459, + "learning_rate": 8.96825880392696e-06, + "loss": 0.911, + "step": 2437 + }, + { + "epoch": 1.239057239057239, + "grad_norm": 3.199500799179077, + "learning_rate": 8.967235750745483e-06, + "loss": 0.864, + "step": 2438 + }, + { + "epoch": 1.2395654659805604, + "grad_norm": 3.2523953914642334, + "learning_rate": 8.966212249009675e-06, + "loss": 0.8658, + "step": 2439 + }, + { + "epoch": 1.2400736929038816, + "grad_norm": 3.1522932052612305, + "learning_rate": 8.96518829883526e-06, + "loss": 0.8349, + "step": 2440 + }, + { + "epoch": 1.2405819198272028, + "grad_norm": 3.049180030822754, + "learning_rate": 8.964163900338011e-06, + "loss": 0.8032, + "step": 2441 + }, + { + "epoch": 1.2410901467505242, + "grad_norm": 3.3346316814422607, + "learning_rate": 8.963139053633752e-06, + "loss": 0.8094, + "step": 2442 + }, + { + "epoch": 1.2415983736738454, + "grad_norm": 3.185328245162964, + "learning_rate": 8.962113758838356e-06, + "loss": 0.8299, + "step": 2443 + }, + { + "epoch": 1.2421066005971666, + "grad_norm": 2.983642101287842, + "learning_rate": 8.961088016067744e-06, + "loss": 0.8406, + "step": 2444 + }, + { + "epoch": 1.2426148275204878, + "grad_norm": 2.7408642768859863, + "learning_rate": 8.960061825437897e-06, + "loss": 0.682, + "step": 2445 + }, + { + "epoch": 1.2431230544438092, + "grad_norm": 3.0992236137390137, + "learning_rate": 8.95903518706484e-06, + "loss": 0.856, + "step": 2446 + }, + { + "epoch": 1.2436312813671304, + "grad_norm": 3.2850234508514404, + "learning_rate": 8.958008101064646e-06, + "loss": 0.8097, + "step": 2447 + }, + { + "epoch": 1.2441395082904516, + "grad_norm": 3.040407419204712, + "learning_rate": 8.956980567553443e-06, + "loss": 0.8335, + "step": 2448 + }, + { + "epoch": 1.244647735213773, + "grad_norm": 3.125934362411499, + "learning_rate": 8.955952586647414e-06, + "loss": 0.8421, + "step": 2449 + }, + { + "epoch": 1.2451559621370942, + "grad_norm": 3.215177536010742, + "learning_rate": 8.954924158462782e-06, + "loss": 0.8339, + "step": 2450 + }, + { + "epoch": 1.2456641890604154, + "grad_norm": 3.099355459213257, + "learning_rate": 8.953895283115825e-06, + "loss": 0.7777, + "step": 2451 + }, + { + "epoch": 1.2461724159837368, + "grad_norm": 2.988253116607666, + "learning_rate": 8.952865960722876e-06, + "loss": 0.8, + "step": 2452 + }, + { + "epoch": 1.246680642907058, + "grad_norm": 2.885324716567993, + "learning_rate": 8.951836191400316e-06, + "loss": 0.8199, + "step": 2453 + }, + { + "epoch": 1.2471888698303792, + "grad_norm": 3.369645357131958, + "learning_rate": 8.950805975264572e-06, + "loss": 0.8281, + "step": 2454 + }, + { + "epoch": 1.2476970967537004, + "grad_norm": 3.1595754623413086, + "learning_rate": 8.949775312432125e-06, + "loss": 0.8552, + "step": 2455 + }, + { + "epoch": 1.2482053236770219, + "grad_norm": 3.157674551010132, + "learning_rate": 8.94874420301951e-06, + "loss": 0.8398, + "step": 2456 + }, + { + "epoch": 1.248713550600343, + "grad_norm": 2.965175151824951, + "learning_rate": 8.947712647143308e-06, + "loss": 0.824, + "step": 2457 + }, + { + "epoch": 1.2492217775236643, + "grad_norm": 3.188775062561035, + "learning_rate": 8.946680644920148e-06, + "loss": 0.9177, + "step": 2458 + }, + { + "epoch": 1.2497300044469857, + "grad_norm": 3.110813856124878, + "learning_rate": 8.945648196466718e-06, + "loss": 0.8316, + "step": 2459 + }, + { + "epoch": 1.2502382313703069, + "grad_norm": 3.100200653076172, + "learning_rate": 8.944615301899749e-06, + "loss": 0.8408, + "step": 2460 + }, + { + "epoch": 1.250746458293628, + "grad_norm": 2.9803078174591064, + "learning_rate": 8.943581961336023e-06, + "loss": 0.8405, + "step": 2461 + }, + { + "epoch": 1.2512546852169493, + "grad_norm": 3.0053930282592773, + "learning_rate": 8.942548174892379e-06, + "loss": 0.8902, + "step": 2462 + }, + { + "epoch": 1.2517629121402707, + "grad_norm": 3.080328941345215, + "learning_rate": 8.941513942685698e-06, + "loss": 0.8324, + "step": 2463 + }, + { + "epoch": 1.2522711390635919, + "grad_norm": 3.199618101119995, + "learning_rate": 8.940479264832918e-06, + "loss": 0.787, + "step": 2464 + }, + { + "epoch": 1.252779365986913, + "grad_norm": 3.244206428527832, + "learning_rate": 8.93944414145102e-06, + "loss": 0.8252, + "step": 2465 + }, + { + "epoch": 1.2532875929102345, + "grad_norm": 3.08567476272583, + "learning_rate": 8.938408572657045e-06, + "loss": 0.8402, + "step": 2466 + }, + { + "epoch": 1.2537958198335557, + "grad_norm": 3.227609157562256, + "learning_rate": 8.937372558568078e-06, + "loss": 0.8494, + "step": 2467 + }, + { + "epoch": 1.254304046756877, + "grad_norm": 3.0492734909057617, + "learning_rate": 8.936336099301253e-06, + "loss": 0.9403, + "step": 2468 + }, + { + "epoch": 1.2548122736801983, + "grad_norm": 2.8660738468170166, + "learning_rate": 8.93529919497376e-06, + "loss": 0.7848, + "step": 2469 + }, + { + "epoch": 1.2553205006035195, + "grad_norm": 2.914168119430542, + "learning_rate": 8.934261845702835e-06, + "loss": 0.8184, + "step": 2470 + }, + { + "epoch": 1.2558287275268407, + "grad_norm": 3.180852174758911, + "learning_rate": 8.933224051605764e-06, + "loss": 0.85, + "step": 2471 + }, + { + "epoch": 1.2563369544501621, + "grad_norm": 3.4860987663269043, + "learning_rate": 8.932185812799888e-06, + "loss": 0.8416, + "step": 2472 + }, + { + "epoch": 1.2568451813734833, + "grad_norm": 3.155968427658081, + "learning_rate": 8.931147129402592e-06, + "loss": 0.8476, + "step": 2473 + }, + { + "epoch": 1.2573534082968045, + "grad_norm": 3.176732063293457, + "learning_rate": 8.930108001531318e-06, + "loss": 0.8863, + "step": 2474 + }, + { + "epoch": 1.2578616352201257, + "grad_norm": 3.208754301071167, + "learning_rate": 8.929068429303553e-06, + "loss": 0.8382, + "step": 2475 + }, + { + "epoch": 1.258369862143447, + "grad_norm": 3.254345655441284, + "learning_rate": 8.928028412836835e-06, + "loss": 0.8497, + "step": 2476 + }, + { + "epoch": 1.2588780890667683, + "grad_norm": 3.4679901599884033, + "learning_rate": 8.926987952248753e-06, + "loss": 0.8932, + "step": 2477 + }, + { + "epoch": 1.2593863159900895, + "grad_norm": 3.174726963043213, + "learning_rate": 8.925947047656949e-06, + "loss": 0.771, + "step": 2478 + }, + { + "epoch": 1.2598945429134107, + "grad_norm": 3.153735399246216, + "learning_rate": 8.92490569917911e-06, + "loss": 0.8844, + "step": 2479 + }, + { + "epoch": 1.2604027698367322, + "grad_norm": 3.165095329284668, + "learning_rate": 8.923863906932976e-06, + "loss": 0.781, + "step": 2480 + }, + { + "epoch": 1.2609109967600534, + "grad_norm": 3.09627628326416, + "learning_rate": 8.922821671036338e-06, + "loss": 0.8963, + "step": 2481 + }, + { + "epoch": 1.2614192236833746, + "grad_norm": 3.1823904514312744, + "learning_rate": 8.921778991607036e-06, + "loss": 0.8274, + "step": 2482 + }, + { + "epoch": 1.261927450606696, + "grad_norm": 3.225573778152466, + "learning_rate": 8.920735868762957e-06, + "loss": 0.8876, + "step": 2483 + }, + { + "epoch": 1.2624356775300172, + "grad_norm": 2.9334287643432617, + "learning_rate": 8.919692302622048e-06, + "loss": 0.7982, + "step": 2484 + }, + { + "epoch": 1.2629439044533384, + "grad_norm": 11.20725154876709, + "learning_rate": 8.918648293302293e-06, + "loss": 0.869, + "step": 2485 + }, + { + "epoch": 1.2634521313766598, + "grad_norm": 3.3023571968078613, + "learning_rate": 8.917603840921736e-06, + "loss": 0.8895, + "step": 2486 + }, + { + "epoch": 1.263960358299981, + "grad_norm": 2.789487838745117, + "learning_rate": 8.916558945598469e-06, + "loss": 0.8183, + "step": 2487 + }, + { + "epoch": 1.2644685852233022, + "grad_norm": 3.5704424381256104, + "learning_rate": 8.915513607450627e-06, + "loss": 0.9285, + "step": 2488 + }, + { + "epoch": 1.2649768121466234, + "grad_norm": 2.936912775039673, + "learning_rate": 8.914467826596408e-06, + "loss": 0.7793, + "step": 2489 + }, + { + "epoch": 1.2654850390699448, + "grad_norm": 3.02742338180542, + "learning_rate": 8.913421603154046e-06, + "loss": 0.8367, + "step": 2490 + }, + { + "epoch": 1.265993265993266, + "grad_norm": 3.056135416030884, + "learning_rate": 8.91237493724184e-06, + "loss": 0.8521, + "step": 2491 + }, + { + "epoch": 1.2665014929165872, + "grad_norm": 3.128657102584839, + "learning_rate": 8.911327828978123e-06, + "loss": 0.9025, + "step": 2492 + }, + { + "epoch": 1.2670097198399084, + "grad_norm": 2.892381191253662, + "learning_rate": 8.910280278481289e-06, + "loss": 0.7111, + "step": 2493 + }, + { + "epoch": 1.2675179467632298, + "grad_norm": 2.8076236248016357, + "learning_rate": 8.90923228586978e-06, + "loss": 0.8013, + "step": 2494 + }, + { + "epoch": 1.268026173686551, + "grad_norm": 3.046893835067749, + "learning_rate": 8.908183851262087e-06, + "loss": 0.8518, + "step": 2495 + }, + { + "epoch": 1.2685344006098722, + "grad_norm": 3.2953848838806152, + "learning_rate": 8.90713497477675e-06, + "loss": 0.8759, + "step": 2496 + }, + { + "epoch": 1.2690426275331936, + "grad_norm": 3.101726770401001, + "learning_rate": 8.906085656532361e-06, + "loss": 0.7667, + "step": 2497 + }, + { + "epoch": 1.2695508544565148, + "grad_norm": 3.3615872859954834, + "learning_rate": 8.905035896647561e-06, + "loss": 0.8447, + "step": 2498 + }, + { + "epoch": 1.270059081379836, + "grad_norm": 3.2602908611297607, + "learning_rate": 8.903985695241037e-06, + "loss": 0.8351, + "step": 2499 + }, + { + "epoch": 1.2705673083031574, + "grad_norm": 3.361398458480835, + "learning_rate": 8.902935052431534e-06, + "loss": 0.8394, + "step": 2500 + }, + { + "epoch": 1.2705673083031574, + "eval_loss": 1.2745423316955566, + "eval_runtime": 15.0612, + "eval_samples_per_second": 26.558, + "eval_steps_per_second": 3.32, + "step": 2500 + }, + { + "epoch": 1.2710755352264786, + "grad_norm": 3.303532838821411, + "learning_rate": 8.90188396833784e-06, + "loss": 0.9127, + "step": 2501 + }, + { + "epoch": 1.2715837621497998, + "grad_norm": 3.171142578125, + "learning_rate": 8.9008324430788e-06, + "loss": 0.8087, + "step": 2502 + }, + { + "epoch": 1.2720919890731213, + "grad_norm": 3.1894915103912354, + "learning_rate": 8.899780476773297e-06, + "loss": 0.9523, + "step": 2503 + }, + { + "epoch": 1.2726002159964425, + "grad_norm": 3.1396098136901855, + "learning_rate": 8.898728069540278e-06, + "loss": 0.8368, + "step": 2504 + }, + { + "epoch": 1.2731084429197637, + "grad_norm": 3.1784250736236572, + "learning_rate": 8.897675221498729e-06, + "loss": 0.7707, + "step": 2505 + }, + { + "epoch": 1.2736166698430849, + "grad_norm": 3.0713679790496826, + "learning_rate": 8.896621932767692e-06, + "loss": 0.8648, + "step": 2506 + }, + { + "epoch": 1.2741248967664063, + "grad_norm": 3.134429693222046, + "learning_rate": 8.895568203466256e-06, + "loss": 0.7814, + "step": 2507 + }, + { + "epoch": 1.2746331236897275, + "grad_norm": 3.5291848182678223, + "learning_rate": 8.894514033713562e-06, + "loss": 0.8768, + "step": 2508 + }, + { + "epoch": 1.2751413506130487, + "grad_norm": 3.3426871299743652, + "learning_rate": 8.893459423628797e-06, + "loss": 0.941, + "step": 2509 + }, + { + "epoch": 1.2756495775363699, + "grad_norm": 3.3209519386291504, + "learning_rate": 8.8924043733312e-06, + "loss": 0.9354, + "step": 2510 + }, + { + "epoch": 1.2761578044596913, + "grad_norm": 2.953981876373291, + "learning_rate": 8.891348882940063e-06, + "loss": 0.8667, + "step": 2511 + }, + { + "epoch": 1.2766660313830125, + "grad_norm": 3.2882747650146484, + "learning_rate": 8.890292952574723e-06, + "loss": 0.8203, + "step": 2512 + }, + { + "epoch": 1.2771742583063337, + "grad_norm": 3.161607027053833, + "learning_rate": 8.889236582354568e-06, + "loss": 0.8898, + "step": 2513 + }, + { + "epoch": 1.277682485229655, + "grad_norm": 3.209338426589966, + "learning_rate": 8.888179772399038e-06, + "loss": 0.8284, + "step": 2514 + }, + { + "epoch": 1.2781907121529763, + "grad_norm": 3.230221748352051, + "learning_rate": 8.887122522827617e-06, + "loss": 0.8283, + "step": 2515 + }, + { + "epoch": 1.2786989390762975, + "grad_norm": 3.2188124656677246, + "learning_rate": 8.886064833759847e-06, + "loss": 0.8498, + "step": 2516 + }, + { + "epoch": 1.279207165999619, + "grad_norm": 3.1550827026367188, + "learning_rate": 8.885006705315313e-06, + "loss": 0.8682, + "step": 2517 + }, + { + "epoch": 1.2797153929229401, + "grad_norm": 3.071791648864746, + "learning_rate": 8.883948137613651e-06, + "loss": 0.7674, + "step": 2518 + }, + { + "epoch": 1.2802236198462613, + "grad_norm": 2.99682354927063, + "learning_rate": 8.882889130774551e-06, + "loss": 0.8389, + "step": 2519 + }, + { + "epoch": 1.2807318467695827, + "grad_norm": 3.1506733894348145, + "learning_rate": 8.881829684917746e-06, + "loss": 0.8242, + "step": 2520 + }, + { + "epoch": 1.281240073692904, + "grad_norm": 2.9105701446533203, + "learning_rate": 8.880769800163025e-06, + "loss": 0.7714, + "step": 2521 + }, + { + "epoch": 1.2817483006162251, + "grad_norm": 3.228342294692993, + "learning_rate": 8.879709476630219e-06, + "loss": 0.8571, + "step": 2522 + }, + { + "epoch": 1.2822565275395463, + "grad_norm": 3.045037031173706, + "learning_rate": 8.878648714439217e-06, + "loss": 0.8537, + "step": 2523 + }, + { + "epoch": 1.2827647544628678, + "grad_norm": 3.146073579788208, + "learning_rate": 8.877587513709954e-06, + "loss": 0.8636, + "step": 2524 + }, + { + "epoch": 1.283272981386189, + "grad_norm": 3.209416627883911, + "learning_rate": 8.876525874562413e-06, + "loss": 0.8199, + "step": 2525 + }, + { + "epoch": 1.2837812083095101, + "grad_norm": 2.9850914478302, + "learning_rate": 8.875463797116627e-06, + "loss": 0.8699, + "step": 2526 + }, + { + "epoch": 1.2842894352328313, + "grad_norm": 3.307227849960327, + "learning_rate": 8.874401281492681e-06, + "loss": 0.8231, + "step": 2527 + }, + { + "epoch": 1.2847976621561528, + "grad_norm": 2.9989612102508545, + "learning_rate": 8.873338327810708e-06, + "loss": 0.787, + "step": 2528 + }, + { + "epoch": 1.285305889079474, + "grad_norm": 3.090996742248535, + "learning_rate": 8.872274936190888e-06, + "loss": 0.8456, + "step": 2529 + }, + { + "epoch": 1.2858141160027952, + "grad_norm": 3.1071884632110596, + "learning_rate": 8.871211106753457e-06, + "loss": 0.7524, + "step": 2530 + }, + { + "epoch": 1.2863223429261166, + "grad_norm": 3.232839822769165, + "learning_rate": 8.870146839618694e-06, + "loss": 0.8982, + "step": 2531 + }, + { + "epoch": 1.2868305698494378, + "grad_norm": 3.2980551719665527, + "learning_rate": 8.869082134906931e-06, + "loss": 0.8118, + "step": 2532 + }, + { + "epoch": 1.287338796772759, + "grad_norm": 3.268399715423584, + "learning_rate": 8.868016992738548e-06, + "loss": 0.803, + "step": 2533 + }, + { + "epoch": 1.2878470236960804, + "grad_norm": 3.322483539581299, + "learning_rate": 8.866951413233976e-06, + "loss": 0.9056, + "step": 2534 + }, + { + "epoch": 1.2883552506194016, + "grad_norm": 3.203437328338623, + "learning_rate": 8.865885396513693e-06, + "loss": 0.9368, + "step": 2535 + }, + { + "epoch": 1.2888634775427228, + "grad_norm": 2.9805757999420166, + "learning_rate": 8.864818942698228e-06, + "loss": 0.8216, + "step": 2536 + }, + { + "epoch": 1.2893717044660442, + "grad_norm": 2.8534796237945557, + "learning_rate": 8.86375205190816e-06, + "loss": 0.78, + "step": 2537 + }, + { + "epoch": 1.2898799313893654, + "grad_norm": 2.94832444190979, + "learning_rate": 8.862684724264118e-06, + "loss": 0.7969, + "step": 2538 + }, + { + "epoch": 1.2903881583126866, + "grad_norm": 2.9740326404571533, + "learning_rate": 8.861616959886774e-06, + "loss": 0.9288, + "step": 2539 + }, + { + "epoch": 1.2908963852360078, + "grad_norm": 3.0878005027770996, + "learning_rate": 8.86054875889686e-06, + "loss": 0.7948, + "step": 2540 + }, + { + "epoch": 1.291404612159329, + "grad_norm": 3.220125198364258, + "learning_rate": 8.859480121415152e-06, + "loss": 0.8302, + "step": 2541 + }, + { + "epoch": 1.2919128390826504, + "grad_norm": 3.5187385082244873, + "learning_rate": 8.85841104756247e-06, + "loss": 0.8091, + "step": 2542 + }, + { + "epoch": 1.2924210660059716, + "grad_norm": 3.397118330001831, + "learning_rate": 8.857341537459693e-06, + "loss": 0.8509, + "step": 2543 + }, + { + "epoch": 1.2929292929292928, + "grad_norm": 3.103379726409912, + "learning_rate": 8.856271591227743e-06, + "loss": 0.8122, + "step": 2544 + }, + { + "epoch": 1.2934375198526142, + "grad_norm": 3.081847667694092, + "learning_rate": 8.855201208987593e-06, + "loss": 0.9056, + "step": 2545 + }, + { + "epoch": 1.2939457467759354, + "grad_norm": 3.5324161052703857, + "learning_rate": 8.854130390860268e-06, + "loss": 0.8944, + "step": 2546 + }, + { + "epoch": 1.2944539736992566, + "grad_norm": 3.22245192527771, + "learning_rate": 8.853059136966835e-06, + "loss": 0.8114, + "step": 2547 + }, + { + "epoch": 1.294962200622578, + "grad_norm": 3.203016996383667, + "learning_rate": 8.851987447428419e-06, + "loss": 0.8688, + "step": 2548 + }, + { + "epoch": 1.2954704275458993, + "grad_norm": 3.2853200435638428, + "learning_rate": 8.850915322366187e-06, + "loss": 0.7993, + "step": 2549 + }, + { + "epoch": 1.2959786544692204, + "grad_norm": 2.8735644817352295, + "learning_rate": 8.849842761901363e-06, + "loss": 0.8585, + "step": 2550 + }, + { + "epoch": 1.2964868813925419, + "grad_norm": 3.0382649898529053, + "learning_rate": 8.848769766155212e-06, + "loss": 0.8293, + "step": 2551 + }, + { + "epoch": 1.296995108315863, + "grad_norm": 2.963172197341919, + "learning_rate": 8.847696335249055e-06, + "loss": 0.8423, + "step": 2552 + }, + { + "epoch": 1.2975033352391843, + "grad_norm": 3.24950909614563, + "learning_rate": 8.846622469304255e-06, + "loss": 0.7968, + "step": 2553 + }, + { + "epoch": 1.2980115621625057, + "grad_norm": 3.5385589599609375, + "learning_rate": 8.845548168442232e-06, + "loss": 0.9819, + "step": 2554 + }, + { + "epoch": 1.2985197890858269, + "grad_norm": 3.3161239624023438, + "learning_rate": 8.844473432784448e-06, + "loss": 0.8769, + "step": 2555 + }, + { + "epoch": 1.299028016009148, + "grad_norm": 3.361607074737549, + "learning_rate": 8.843398262452422e-06, + "loss": 0.873, + "step": 2556 + }, + { + "epoch": 1.2995362429324693, + "grad_norm": 2.9351627826690674, + "learning_rate": 8.842322657567715e-06, + "loss": 0.8579, + "step": 2557 + }, + { + "epoch": 1.3000444698557905, + "grad_norm": 2.9046859741210938, + "learning_rate": 8.84124661825194e-06, + "loss": 0.8712, + "step": 2558 + }, + { + "epoch": 1.300552696779112, + "grad_norm": 3.100588798522949, + "learning_rate": 8.840170144626761e-06, + "loss": 0.8623, + "step": 2559 + }, + { + "epoch": 1.301060923702433, + "grad_norm": 3.147078275680542, + "learning_rate": 8.839093236813887e-06, + "loss": 0.8377, + "step": 2560 + }, + { + "epoch": 1.3015691506257543, + "grad_norm": 3.067751884460449, + "learning_rate": 8.83801589493508e-06, + "loss": 0.8867, + "step": 2561 + }, + { + "epoch": 1.3020773775490757, + "grad_norm": 3.0106406211853027, + "learning_rate": 8.836938119112145e-06, + "loss": 0.8218, + "step": 2562 + }, + { + "epoch": 1.302585604472397, + "grad_norm": 2.999750852584839, + "learning_rate": 8.835859909466949e-06, + "loss": 0.8377, + "step": 2563 + }, + { + "epoch": 1.303093831395718, + "grad_norm": 3.097104072570801, + "learning_rate": 8.834781266121391e-06, + "loss": 0.7746, + "step": 2564 + }, + { + "epoch": 1.3036020583190395, + "grad_norm": 3.1769418716430664, + "learning_rate": 8.83370218919743e-06, + "loss": 0.937, + "step": 2565 + }, + { + "epoch": 1.3041102852423607, + "grad_norm": 2.8542466163635254, + "learning_rate": 8.832622678817074e-06, + "loss": 0.8561, + "step": 2566 + }, + { + "epoch": 1.304618512165682, + "grad_norm": 3.1751227378845215, + "learning_rate": 8.831542735102375e-06, + "loss": 0.791, + "step": 2567 + }, + { + "epoch": 1.3051267390890033, + "grad_norm": 3.0102860927581787, + "learning_rate": 8.830462358175438e-06, + "loss": 0.9021, + "step": 2568 + }, + { + "epoch": 1.3056349660123245, + "grad_norm": 3.0185563564300537, + "learning_rate": 8.829381548158414e-06, + "loss": 0.7755, + "step": 2569 + }, + { + "epoch": 1.3061431929356457, + "grad_norm": 2.9850900173187256, + "learning_rate": 8.828300305173506e-06, + "loss": 0.854, + "step": 2570 + }, + { + "epoch": 1.3066514198589672, + "grad_norm": 3.0586602687835693, + "learning_rate": 8.827218629342962e-06, + "loss": 0.7996, + "step": 2571 + }, + { + "epoch": 1.3071596467822884, + "grad_norm": 3.3865036964416504, + "learning_rate": 8.826136520789084e-06, + "loss": 0.7912, + "step": 2572 + }, + { + "epoch": 1.3076678737056096, + "grad_norm": 2.9162116050720215, + "learning_rate": 8.82505397963422e-06, + "loss": 0.8339, + "step": 2573 + }, + { + "epoch": 1.3081761006289307, + "grad_norm": 3.1088786125183105, + "learning_rate": 8.823971006000767e-06, + "loss": 0.9219, + "step": 2574 + }, + { + "epoch": 1.308684327552252, + "grad_norm": 3.166175365447998, + "learning_rate": 8.822887600011168e-06, + "loss": 0.9238, + "step": 2575 + }, + { + "epoch": 1.3091925544755734, + "grad_norm": 3.029024124145508, + "learning_rate": 8.821803761787923e-06, + "loss": 0.7947, + "step": 2576 + }, + { + "epoch": 1.3097007813988946, + "grad_norm": 3.238969087600708, + "learning_rate": 8.820719491453572e-06, + "loss": 0.9903, + "step": 2577 + }, + { + "epoch": 1.3102090083222158, + "grad_norm": 3.3764801025390625, + "learning_rate": 8.819634789130709e-06, + "loss": 0.9136, + "step": 2578 + }, + { + "epoch": 1.3107172352455372, + "grad_norm": 3.1779088973999023, + "learning_rate": 8.818549654941976e-06, + "loss": 0.8954, + "step": 2579 + }, + { + "epoch": 1.3112254621688584, + "grad_norm": 2.949017286300659, + "learning_rate": 8.817464089010064e-06, + "loss": 0.8774, + "step": 2580 + }, + { + "epoch": 1.3117336890921796, + "grad_norm": 3.089338541030884, + "learning_rate": 8.81637809145771e-06, + "loss": 0.7818, + "step": 2581 + }, + { + "epoch": 1.312241916015501, + "grad_norm": 3.3381898403167725, + "learning_rate": 8.815291662407704e-06, + "loss": 0.7747, + "step": 2582 + }, + { + "epoch": 1.3127501429388222, + "grad_norm": 3.0036628246307373, + "learning_rate": 8.814204801982882e-06, + "loss": 0.802, + "step": 2583 + }, + { + "epoch": 1.3132583698621434, + "grad_norm": 3.6632609367370605, + "learning_rate": 8.813117510306128e-06, + "loss": 0.796, + "step": 2584 + }, + { + "epoch": 1.3137665967854648, + "grad_norm": 3.659998893737793, + "learning_rate": 8.812029787500379e-06, + "loss": 0.8787, + "step": 2585 + }, + { + "epoch": 1.314274823708786, + "grad_norm": 3.202430248260498, + "learning_rate": 8.810941633688617e-06, + "loss": 0.8552, + "step": 2586 + }, + { + "epoch": 1.3147830506321072, + "grad_norm": 3.068216562271118, + "learning_rate": 8.809853048993873e-06, + "loss": 0.8298, + "step": 2587 + }, + { + "epoch": 1.3152912775554286, + "grad_norm": 3.2713656425476074, + "learning_rate": 8.80876403353923e-06, + "loss": 0.8764, + "step": 2588 + }, + { + "epoch": 1.3157995044787498, + "grad_norm": 3.147080898284912, + "learning_rate": 8.807674587447813e-06, + "loss": 0.846, + "step": 2589 + }, + { + "epoch": 1.316307731402071, + "grad_norm": 3.5714316368103027, + "learning_rate": 8.806584710842803e-06, + "loss": 0.9365, + "step": 2590 + }, + { + "epoch": 1.3168159583253922, + "grad_norm": 3.3361597061157227, + "learning_rate": 8.805494403847426e-06, + "loss": 0.7961, + "step": 2591 + }, + { + "epoch": 1.3173241852487134, + "grad_norm": 3.182502269744873, + "learning_rate": 8.804403666584958e-06, + "loss": 0.8503, + "step": 2592 + }, + { + "epoch": 1.3178324121720348, + "grad_norm": 3.635493755340576, + "learning_rate": 8.803312499178722e-06, + "loss": 0.8862, + "step": 2593 + }, + { + "epoch": 1.318340639095356, + "grad_norm": 2.8551406860351562, + "learning_rate": 8.80222090175209e-06, + "loss": 0.7413, + "step": 2594 + }, + { + "epoch": 1.3188488660186772, + "grad_norm": 3.0634207725524902, + "learning_rate": 8.801128874428482e-06, + "loss": 0.9011, + "step": 2595 + }, + { + "epoch": 1.3193570929419987, + "grad_norm": 3.162566900253296, + "learning_rate": 8.800036417331372e-06, + "loss": 0.8009, + "step": 2596 + }, + { + "epoch": 1.3198653198653199, + "grad_norm": 3.1202633380889893, + "learning_rate": 8.798943530584275e-06, + "loss": 0.8532, + "step": 2597 + }, + { + "epoch": 1.320373546788641, + "grad_norm": 3.2355780601501465, + "learning_rate": 8.797850214310756e-06, + "loss": 0.8975, + "step": 2598 + }, + { + "epoch": 1.3208817737119625, + "grad_norm": 3.200838565826416, + "learning_rate": 8.796756468634436e-06, + "loss": 0.8297, + "step": 2599 + }, + { + "epoch": 1.3213900006352837, + "grad_norm": 3.2080655097961426, + "learning_rate": 8.795662293678976e-06, + "loss": 0.83, + "step": 2600 + }, + { + "epoch": 1.3218982275586049, + "grad_norm": 3.2180094718933105, + "learning_rate": 8.794567689568088e-06, + "loss": 0.9397, + "step": 2601 + }, + { + "epoch": 1.3224064544819263, + "grad_norm": 3.111396074295044, + "learning_rate": 8.793472656425533e-06, + "loss": 0.8781, + "step": 2602 + }, + { + "epoch": 1.3229146814052475, + "grad_norm": 3.1451263427734375, + "learning_rate": 8.792377194375123e-06, + "loss": 0.839, + "step": 2603 + }, + { + "epoch": 1.3234229083285687, + "grad_norm": 3.002424955368042, + "learning_rate": 8.791281303540714e-06, + "loss": 0.8521, + "step": 2604 + }, + { + "epoch": 1.32393113525189, + "grad_norm": 2.9210152626037598, + "learning_rate": 8.790184984046212e-06, + "loss": 0.8843, + "step": 2605 + }, + { + "epoch": 1.3244393621752113, + "grad_norm": 3.1625709533691406, + "learning_rate": 8.789088236015576e-06, + "loss": 0.871, + "step": 2606 + }, + { + "epoch": 1.3249475890985325, + "grad_norm": 3.112685441970825, + "learning_rate": 8.787991059572803e-06, + "loss": 0.7916, + "step": 2607 + }, + { + "epoch": 1.3254558160218537, + "grad_norm": 3.3765015602111816, + "learning_rate": 8.786893454841949e-06, + "loss": 0.8464, + "step": 2608 + }, + { + "epoch": 1.325964042945175, + "grad_norm": 3.056694507598877, + "learning_rate": 8.785795421947116e-06, + "loss": 0.8172, + "step": 2609 + }, + { + "epoch": 1.3264722698684963, + "grad_norm": 3.156933546066284, + "learning_rate": 8.784696961012448e-06, + "loss": 0.8663, + "step": 2610 + }, + { + "epoch": 1.3269804967918175, + "grad_norm": 2.98030161857605, + "learning_rate": 8.783598072162147e-06, + "loss": 0.7714, + "step": 2611 + }, + { + "epoch": 1.3274887237151387, + "grad_norm": 3.092323064804077, + "learning_rate": 8.782498755520457e-06, + "loss": 0.7489, + "step": 2612 + }, + { + "epoch": 1.3279969506384601, + "grad_norm": 3.140317916870117, + "learning_rate": 8.78139901121167e-06, + "loss": 0.8019, + "step": 2613 + }, + { + "epoch": 1.3285051775617813, + "grad_norm": 3.0484914779663086, + "learning_rate": 8.780298839360129e-06, + "loss": 0.9009, + "step": 2614 + }, + { + "epoch": 1.3290134044851025, + "grad_norm": 3.2454006671905518, + "learning_rate": 8.779198240090225e-06, + "loss": 0.8669, + "step": 2615 + }, + { + "epoch": 1.329521631408424, + "grad_norm": 3.0834341049194336, + "learning_rate": 8.778097213526398e-06, + "loss": 0.804, + "step": 2616 + }, + { + "epoch": 1.3300298583317451, + "grad_norm": 3.589625597000122, + "learning_rate": 8.776995759793132e-06, + "loss": 0.8648, + "step": 2617 + }, + { + "epoch": 1.3305380852550663, + "grad_norm": 2.9998013973236084, + "learning_rate": 8.775893879014968e-06, + "loss": 0.7427, + "step": 2618 + }, + { + "epoch": 1.3310463121783878, + "grad_norm": 3.2124462127685547, + "learning_rate": 8.774791571316484e-06, + "loss": 0.863, + "step": 2619 + }, + { + "epoch": 1.331554539101709, + "grad_norm": 3.1781957149505615, + "learning_rate": 8.773688836822317e-06, + "loss": 0.8429, + "step": 2620 + }, + { + "epoch": 1.3320627660250302, + "grad_norm": 3.172304391860962, + "learning_rate": 8.772585675657144e-06, + "loss": 0.882, + "step": 2621 + }, + { + "epoch": 1.3325709929483516, + "grad_norm": 2.9271175861358643, + "learning_rate": 8.771482087945693e-06, + "loss": 0.7754, + "step": 2622 + }, + { + "epoch": 1.3330792198716728, + "grad_norm": 3.295121669769287, + "learning_rate": 8.770378073812745e-06, + "loss": 0.7888, + "step": 2623 + }, + { + "epoch": 1.333587446794994, + "grad_norm": 3.0873188972473145, + "learning_rate": 8.769273633383122e-06, + "loss": 0.7987, + "step": 2624 + }, + { + "epoch": 1.3340956737183152, + "grad_norm": 3.130263090133667, + "learning_rate": 8.768168766781698e-06, + "loss": 0.8407, + "step": 2625 + }, + { + "epoch": 1.3346039006416364, + "grad_norm": 3.202841281890869, + "learning_rate": 8.767063474133392e-06, + "loss": 0.7984, + "step": 2626 + }, + { + "epoch": 1.3351121275649578, + "grad_norm": 2.8878347873687744, + "learning_rate": 8.765957755563177e-06, + "loss": 0.7478, + "step": 2627 + }, + { + "epoch": 1.335620354488279, + "grad_norm": 3.223191261291504, + "learning_rate": 8.76485161119607e-06, + "loss": 0.7901, + "step": 2628 + }, + { + "epoch": 1.3361285814116002, + "grad_norm": 3.7308144569396973, + "learning_rate": 8.763745041157136e-06, + "loss": 0.931, + "step": 2629 + }, + { + "epoch": 1.3366368083349216, + "grad_norm": 3.2447123527526855, + "learning_rate": 8.76263804557149e-06, + "loss": 0.9182, + "step": 2630 + }, + { + "epoch": 1.3371450352582428, + "grad_norm": 3.1200344562530518, + "learning_rate": 8.761530624564292e-06, + "loss": 0.7992, + "step": 2631 + }, + { + "epoch": 1.337653262181564, + "grad_norm": 3.198173761367798, + "learning_rate": 8.760422778260753e-06, + "loss": 0.8869, + "step": 2632 + }, + { + "epoch": 1.3381614891048854, + "grad_norm": 3.0903890132904053, + "learning_rate": 8.759314506786134e-06, + "loss": 0.8946, + "step": 2633 + }, + { + "epoch": 1.3386697160282066, + "grad_norm": 3.123501777648926, + "learning_rate": 8.75820581026574e-06, + "loss": 0.8356, + "step": 2634 + }, + { + "epoch": 1.3391779429515278, + "grad_norm": 3.3818912506103516, + "learning_rate": 8.757096688824925e-06, + "loss": 0.8841, + "step": 2635 + }, + { + "epoch": 1.3396861698748492, + "grad_norm": 3.03412127494812, + "learning_rate": 8.75598714258909e-06, + "loss": 0.8452, + "step": 2636 + }, + { + "epoch": 1.3401943967981704, + "grad_norm": 3.1534507274627686, + "learning_rate": 8.754877171683685e-06, + "loss": 0.9732, + "step": 2637 + }, + { + "epoch": 1.3407026237214916, + "grad_norm": 3.1218719482421875, + "learning_rate": 8.753766776234213e-06, + "loss": 0.8408, + "step": 2638 + }, + { + "epoch": 1.341210850644813, + "grad_norm": 3.4161899089813232, + "learning_rate": 8.752655956366217e-06, + "loss": 0.9102, + "step": 2639 + }, + { + "epoch": 1.3417190775681342, + "grad_norm": 3.1156539916992188, + "learning_rate": 8.751544712205293e-06, + "loss": 0.8302, + "step": 2640 + }, + { + "epoch": 1.3422273044914554, + "grad_norm": 3.08512020111084, + "learning_rate": 8.750433043877083e-06, + "loss": 0.8262, + "step": 2641 + }, + { + "epoch": 1.3427355314147766, + "grad_norm": 3.0877416133880615, + "learning_rate": 8.749320951507276e-06, + "loss": 0.8799, + "step": 2642 + }, + { + "epoch": 1.3432437583380978, + "grad_norm": 3.131417751312256, + "learning_rate": 8.748208435221614e-06, + "loss": 0.8745, + "step": 2643 + }, + { + "epoch": 1.3437519852614193, + "grad_norm": 3.1524205207824707, + "learning_rate": 8.747095495145878e-06, + "loss": 0.9559, + "step": 2644 + }, + { + "epoch": 1.3442602121847405, + "grad_norm": 3.236327648162842, + "learning_rate": 8.745982131405908e-06, + "loss": 0.8704, + "step": 2645 + }, + { + "epoch": 1.3447684391080617, + "grad_norm": 3.1059675216674805, + "learning_rate": 8.744868344127583e-06, + "loss": 0.8759, + "step": 2646 + }, + { + "epoch": 1.345276666031383, + "grad_norm": 3.2322580814361572, + "learning_rate": 8.743754133436832e-06, + "loss": 0.8551, + "step": 2647 + }, + { + "epoch": 1.3457848929547043, + "grad_norm": 3.067265510559082, + "learning_rate": 8.742639499459637e-06, + "loss": 0.8704, + "step": 2648 + }, + { + "epoch": 1.3462931198780255, + "grad_norm": 3.043553590774536, + "learning_rate": 8.74152444232202e-06, + "loss": 0.8944, + "step": 2649 + }, + { + "epoch": 1.3468013468013469, + "grad_norm": 2.9999492168426514, + "learning_rate": 8.740408962150055e-06, + "loss": 0.852, + "step": 2650 + }, + { + "epoch": 1.347309573724668, + "grad_norm": 3.1530864238739014, + "learning_rate": 8.739293059069864e-06, + "loss": 0.8197, + "step": 2651 + }, + { + "epoch": 1.3478178006479893, + "grad_norm": 3.741610527038574, + "learning_rate": 8.738176733207618e-06, + "loss": 0.8789, + "step": 2652 + }, + { + "epoch": 1.3483260275713107, + "grad_norm": 3.1385812759399414, + "learning_rate": 8.73705998468953e-06, + "loss": 0.8479, + "step": 2653 + }, + { + "epoch": 1.348834254494632, + "grad_norm": 3.3255321979522705, + "learning_rate": 8.735942813641869e-06, + "loss": 0.7281, + "step": 2654 + }, + { + "epoch": 1.349342481417953, + "grad_norm": 3.0691211223602295, + "learning_rate": 8.734825220190946e-06, + "loss": 0.8329, + "step": 2655 + }, + { + "epoch": 1.3498507083412743, + "grad_norm": 3.088752269744873, + "learning_rate": 8.733707204463121e-06, + "loss": 0.7821, + "step": 2656 + }, + { + "epoch": 1.3503589352645957, + "grad_norm": 3.136718511581421, + "learning_rate": 8.732588766584803e-06, + "loss": 0.9038, + "step": 2657 + }, + { + "epoch": 1.350867162187917, + "grad_norm": 2.992579460144043, + "learning_rate": 8.731469906682445e-06, + "loss": 0.8415, + "step": 2658 + }, + { + "epoch": 1.3513753891112381, + "grad_norm": 3.259535312652588, + "learning_rate": 8.730350624882557e-06, + "loss": 0.9561, + "step": 2659 + }, + { + "epoch": 1.3518836160345593, + "grad_norm": 3.0274555683135986, + "learning_rate": 8.729230921311682e-06, + "loss": 0.8513, + "step": 2660 + }, + { + "epoch": 1.3523918429578807, + "grad_norm": 3.5799143314361572, + "learning_rate": 8.728110796096426e-06, + "loss": 0.844, + "step": 2661 + }, + { + "epoch": 1.352900069881202, + "grad_norm": 3.2173969745635986, + "learning_rate": 8.726990249363432e-06, + "loss": 0.8714, + "step": 2662 + }, + { + "epoch": 1.3534082968045231, + "grad_norm": 3.0594699382781982, + "learning_rate": 8.725869281239395e-06, + "loss": 0.9004, + "step": 2663 + }, + { + "epoch": 1.3539165237278445, + "grad_norm": 2.9932353496551514, + "learning_rate": 8.724747891851055e-06, + "loss": 0.7776, + "step": 2664 + }, + { + "epoch": 1.3544247506511657, + "grad_norm": 3.293879270553589, + "learning_rate": 8.723626081325205e-06, + "loss": 0.8032, + "step": 2665 + }, + { + "epoch": 1.354932977574487, + "grad_norm": 3.299185037612915, + "learning_rate": 8.722503849788679e-06, + "loss": 0.9281, + "step": 2666 + }, + { + "epoch": 1.3554412044978084, + "grad_norm": 3.27127742767334, + "learning_rate": 8.721381197368366e-06, + "loss": 0.8855, + "step": 2667 + }, + { + "epoch": 1.3559494314211296, + "grad_norm": 3.081345319747925, + "learning_rate": 8.720258124191195e-06, + "loss": 0.8455, + "step": 2668 + }, + { + "epoch": 1.3564576583444508, + "grad_norm": 3.182535409927368, + "learning_rate": 8.719134630384144e-06, + "loss": 0.8738, + "step": 2669 + }, + { + "epoch": 1.3569658852677722, + "grad_norm": 3.1837494373321533, + "learning_rate": 8.718010716074246e-06, + "loss": 0.8641, + "step": 2670 + }, + { + "epoch": 1.3574741121910934, + "grad_norm": 3.0172135829925537, + "learning_rate": 8.716886381388573e-06, + "loss": 0.8186, + "step": 2671 + }, + { + "epoch": 1.3579823391144146, + "grad_norm": 3.1252171993255615, + "learning_rate": 8.715761626454248e-06, + "loss": 0.8675, + "step": 2672 + }, + { + "epoch": 1.3584905660377358, + "grad_norm": 3.1834468841552734, + "learning_rate": 8.71463645139844e-06, + "loss": 0.912, + "step": 2673 + }, + { + "epoch": 1.3589987929610572, + "grad_norm": 3.274007797241211, + "learning_rate": 8.713510856348368e-06, + "loss": 0.8753, + "step": 2674 + }, + { + "epoch": 1.3595070198843784, + "grad_norm": 3.550733804702759, + "learning_rate": 8.712384841431296e-06, + "loss": 0.8694, + "step": 2675 + }, + { + "epoch": 1.3600152468076996, + "grad_norm": 3.228518486022949, + "learning_rate": 8.711258406774536e-06, + "loss": 0.8589, + "step": 2676 + }, + { + "epoch": 1.3605234737310208, + "grad_norm": 3.438473701477051, + "learning_rate": 8.71013155250545e-06, + "loss": 0.8953, + "step": 2677 + }, + { + "epoch": 1.3610317006543422, + "grad_norm": 3.2976551055908203, + "learning_rate": 8.709004278751445e-06, + "loss": 0.8868, + "step": 2678 + }, + { + "epoch": 1.3615399275776634, + "grad_norm": 3.1462578773498535, + "learning_rate": 8.707876585639977e-06, + "loss": 0.8054, + "step": 2679 + }, + { + "epoch": 1.3620481545009846, + "grad_norm": 2.89199161529541, + "learning_rate": 8.706748473298544e-06, + "loss": 0.7397, + "step": 2680 + }, + { + "epoch": 1.362556381424306, + "grad_norm": 3.5015709400177, + "learning_rate": 8.705619941854698e-06, + "loss": 0.8578, + "step": 2681 + }, + { + "epoch": 1.3630646083476272, + "grad_norm": 3.020496368408203, + "learning_rate": 8.70449099143604e-06, + "loss": 0.8183, + "step": 2682 + }, + { + "epoch": 1.3635728352709484, + "grad_norm": 3.3509302139282227, + "learning_rate": 8.703361622170205e-06, + "loss": 0.7856, + "step": 2683 + }, + { + "epoch": 1.3640810621942698, + "grad_norm": 3.096768379211426, + "learning_rate": 8.702231834184895e-06, + "loss": 0.9488, + "step": 2684 + }, + { + "epoch": 1.364589289117591, + "grad_norm": 3.023076295852661, + "learning_rate": 8.701101627607844e-06, + "loss": 0.8422, + "step": 2685 + }, + { + "epoch": 1.3650975160409122, + "grad_norm": 4.890537738800049, + "learning_rate": 8.699971002566839e-06, + "loss": 0.838, + "step": 2686 + }, + { + "epoch": 1.3656057429642336, + "grad_norm": 3.220949172973633, + "learning_rate": 8.698839959189714e-06, + "loss": 0.8532, + "step": 2687 + }, + { + "epoch": 1.3661139698875548, + "grad_norm": 2.687530994415283, + "learning_rate": 8.697708497604352e-06, + "loss": 0.7821, + "step": 2688 + }, + { + "epoch": 1.366622196810876, + "grad_norm": 3.0187814235687256, + "learning_rate": 8.696576617938677e-06, + "loss": 0.9102, + "step": 2689 + }, + { + "epoch": 1.3671304237341972, + "grad_norm": 3.226120948791504, + "learning_rate": 8.695444320320668e-06, + "loss": 0.8591, + "step": 2690 + }, + { + "epoch": 1.3676386506575187, + "grad_norm": 3.4441635608673096, + "learning_rate": 8.694311604878346e-06, + "loss": 0.9067, + "step": 2691 + }, + { + "epoch": 1.3681468775808399, + "grad_norm": 3.1548378467559814, + "learning_rate": 8.693178471739782e-06, + "loss": 0.7731, + "step": 2692 + }, + { + "epoch": 1.368655104504161, + "grad_norm": 2.9003067016601562, + "learning_rate": 8.692044921033096e-06, + "loss": 0.7738, + "step": 2693 + }, + { + "epoch": 1.3691633314274823, + "grad_norm": 3.099714756011963, + "learning_rate": 8.690910952886449e-06, + "loss": 0.7917, + "step": 2694 + }, + { + "epoch": 1.3696715583508037, + "grad_norm": 3.210352897644043, + "learning_rate": 8.689776567428053e-06, + "loss": 0.8826, + "step": 2695 + }, + { + "epoch": 1.3701797852741249, + "grad_norm": 3.1537983417510986, + "learning_rate": 8.688641764786167e-06, + "loss": 0.8355, + "step": 2696 + }, + { + "epoch": 1.370688012197446, + "grad_norm": 3.399169683456421, + "learning_rate": 8.6875065450891e-06, + "loss": 0.9821, + "step": 2697 + }, + { + "epoch": 1.3711962391207675, + "grad_norm": 3.2011547088623047, + "learning_rate": 8.686370908465204e-06, + "loss": 0.8729, + "step": 2698 + }, + { + "epoch": 1.3717044660440887, + "grad_norm": 3.188690185546875, + "learning_rate": 8.685234855042876e-06, + "loss": 0.8369, + "step": 2699 + }, + { + "epoch": 1.3722126929674099, + "grad_norm": 4.217759132385254, + "learning_rate": 8.684098384950567e-06, + "loss": 0.8288, + "step": 2700 + }, + { + "epoch": 1.3727209198907313, + "grad_norm": 3.447901964187622, + "learning_rate": 8.682961498316772e-06, + "loss": 0.8944, + "step": 2701 + }, + { + "epoch": 1.3732291468140525, + "grad_norm": 2.8357911109924316, + "learning_rate": 8.68182419527003e-06, + "loss": 0.8125, + "step": 2702 + }, + { + "epoch": 1.3737373737373737, + "grad_norm": 2.925048828125, + "learning_rate": 8.680686475938933e-06, + "loss": 0.7786, + "step": 2703 + }, + { + "epoch": 1.3742456006606951, + "grad_norm": 3.1883702278137207, + "learning_rate": 8.679548340452115e-06, + "loss": 0.7921, + "step": 2704 + }, + { + "epoch": 1.3747538275840163, + "grad_norm": 3.2614142894744873, + "learning_rate": 8.678409788938259e-06, + "loss": 0.8351, + "step": 2705 + }, + { + "epoch": 1.3752620545073375, + "grad_norm": 3.193164825439453, + "learning_rate": 8.677270821526095e-06, + "loss": 0.7844, + "step": 2706 + }, + { + "epoch": 1.3757702814306587, + "grad_norm": 3.2156474590301514, + "learning_rate": 8.6761314383444e-06, + "loss": 0.8201, + "step": 2707 + }, + { + "epoch": 1.3762785083539801, + "grad_norm": 2.989922523498535, + "learning_rate": 8.674991639521997e-06, + "loss": 0.8055, + "step": 2708 + }, + { + "epoch": 1.3767867352773013, + "grad_norm": 3.1420819759368896, + "learning_rate": 8.673851425187762e-06, + "loss": 0.9387, + "step": 2709 + }, + { + "epoch": 1.3772949622006225, + "grad_norm": 2.995516061782837, + "learning_rate": 8.672710795470606e-06, + "loss": 0.8184, + "step": 2710 + }, + { + "epoch": 1.3778031891239437, + "grad_norm": 3.6818063259124756, + "learning_rate": 8.6715697504995e-06, + "loss": 0.9301, + "step": 2711 + }, + { + "epoch": 1.3783114160472651, + "grad_norm": 3.0470900535583496, + "learning_rate": 8.67042829040345e-06, + "loss": 0.8822, + "step": 2712 + }, + { + "epoch": 1.3788196429705863, + "grad_norm": 3.0707991123199463, + "learning_rate": 8.66928641531152e-06, + "loss": 0.8192, + "step": 2713 + }, + { + "epoch": 1.3793278698939075, + "grad_norm": 3.1534693241119385, + "learning_rate": 8.668144125352814e-06, + "loss": 0.7877, + "step": 2714 + }, + { + "epoch": 1.379836096817229, + "grad_norm": 3.1589243412017822, + "learning_rate": 8.667001420656482e-06, + "loss": 0.8504, + "step": 2715 + }, + { + "epoch": 1.3803443237405502, + "grad_norm": 3.279162645339966, + "learning_rate": 8.665858301351728e-06, + "loss": 0.9218, + "step": 2716 + }, + { + "epoch": 1.3808525506638714, + "grad_norm": 3.084298610687256, + "learning_rate": 8.664714767567796e-06, + "loss": 0.8225, + "step": 2717 + }, + { + "epoch": 1.3813607775871928, + "grad_norm": 3.2460992336273193, + "learning_rate": 8.66357081943398e-06, + "loss": 0.8463, + "step": 2718 + }, + { + "epoch": 1.381869004510514, + "grad_norm": 3.2598676681518555, + "learning_rate": 8.662426457079622e-06, + "loss": 0.9005, + "step": 2719 + }, + { + "epoch": 1.3823772314338352, + "grad_norm": 3.0160598754882812, + "learning_rate": 8.661281680634103e-06, + "loss": 0.8236, + "step": 2720 + }, + { + "epoch": 1.3828854583571566, + "grad_norm": 3.1025872230529785, + "learning_rate": 8.660136490226863e-06, + "loss": 0.8245, + "step": 2721 + }, + { + "epoch": 1.3833936852804778, + "grad_norm": 3.3537919521331787, + "learning_rate": 8.65899088598738e-06, + "loss": 0.9065, + "step": 2722 + }, + { + "epoch": 1.383901912203799, + "grad_norm": 3.2307286262512207, + "learning_rate": 8.657844868045182e-06, + "loss": 0.7384, + "step": 2723 + }, + { + "epoch": 1.3844101391271202, + "grad_norm": 3.2937235832214355, + "learning_rate": 8.656698436529843e-06, + "loss": 0.8946, + "step": 2724 + }, + { + "epoch": 1.3849183660504414, + "grad_norm": 3.5228772163391113, + "learning_rate": 8.655551591570983e-06, + "loss": 0.97, + "step": 2725 + }, + { + "epoch": 1.3854265929737628, + "grad_norm": 3.1984856128692627, + "learning_rate": 8.65440433329827e-06, + "loss": 0.8, + "step": 2726 + }, + { + "epoch": 1.385934819897084, + "grad_norm": 3.3704750537872314, + "learning_rate": 8.65325666184142e-06, + "loss": 0.9496, + "step": 2727 + }, + { + "epoch": 1.3864430468204052, + "grad_norm": 3.2403101921081543, + "learning_rate": 8.652108577330194e-06, + "loss": 0.7782, + "step": 2728 + }, + { + "epoch": 1.3869512737437266, + "grad_norm": 3.0873589515686035, + "learning_rate": 8.650960079894397e-06, + "loss": 0.7821, + "step": 2729 + }, + { + "epoch": 1.3874595006670478, + "grad_norm": 3.159641742706299, + "learning_rate": 8.649811169663886e-06, + "loss": 0.8486, + "step": 2730 + }, + { + "epoch": 1.387967727590369, + "grad_norm": 3.6541502475738525, + "learning_rate": 8.648661846768562e-06, + "loss": 0.8905, + "step": 2731 + }, + { + "epoch": 1.3884759545136904, + "grad_norm": 2.725341558456421, + "learning_rate": 8.647512111338374e-06, + "loss": 0.7955, + "step": 2732 + }, + { + "epoch": 1.3889841814370116, + "grad_norm": 3.1985182762145996, + "learning_rate": 8.646361963503312e-06, + "loss": 0.7561, + "step": 2733 + }, + { + "epoch": 1.3894924083603328, + "grad_norm": 2.953597068786621, + "learning_rate": 8.645211403393422e-06, + "loss": 0.9021, + "step": 2734 + }, + { + "epoch": 1.3900006352836543, + "grad_norm": 3.17386794090271, + "learning_rate": 8.644060431138789e-06, + "loss": 0.8701, + "step": 2735 + }, + { + "epoch": 1.3905088622069754, + "grad_norm": 3.1918575763702393, + "learning_rate": 8.64290904686955e-06, + "loss": 0.7802, + "step": 2736 + }, + { + "epoch": 1.3910170891302966, + "grad_norm": 3.179152488708496, + "learning_rate": 8.64175725071588e-06, + "loss": 0.826, + "step": 2737 + }, + { + "epoch": 1.391525316053618, + "grad_norm": 3.167999505996704, + "learning_rate": 8.640605042808015e-06, + "loss": 0.9195, + "step": 2738 + }, + { + "epoch": 1.3920335429769393, + "grad_norm": 3.178011655807495, + "learning_rate": 8.639452423276222e-06, + "loss": 0.8234, + "step": 2739 + }, + { + "epoch": 1.3925417699002605, + "grad_norm": 3.097113609313965, + "learning_rate": 8.638299392250825e-06, + "loss": 0.8382, + "step": 2740 + }, + { + "epoch": 1.3930499968235817, + "grad_norm": 2.9893417358398438, + "learning_rate": 8.63714594986219e-06, + "loss": 0.822, + "step": 2741 + }, + { + "epoch": 1.3935582237469029, + "grad_norm": 3.445077419281006, + "learning_rate": 8.63599209624073e-06, + "loss": 0.8855, + "step": 2742 + }, + { + "epoch": 1.3940664506702243, + "grad_norm": 3.340830087661743, + "learning_rate": 8.634837831516908e-06, + "loss": 0.8562, + "step": 2743 + }, + { + "epoch": 1.3945746775935455, + "grad_norm": 3.0364067554473877, + "learning_rate": 8.633683155821228e-06, + "loss": 0.836, + "step": 2744 + }, + { + "epoch": 1.3950829045168667, + "grad_norm": 3.1018741130828857, + "learning_rate": 8.632528069284243e-06, + "loss": 0.8154, + "step": 2745 + }, + { + "epoch": 1.395591131440188, + "grad_norm": 3.1715431213378906, + "learning_rate": 8.631372572036554e-06, + "loss": 0.9054, + "step": 2746 + }, + { + "epoch": 1.3960993583635093, + "grad_norm": 3.1135804653167725, + "learning_rate": 8.630216664208807e-06, + "loss": 0.7402, + "step": 2747 + }, + { + "epoch": 1.3966075852868305, + "grad_norm": 3.0619115829467773, + "learning_rate": 8.629060345931692e-06, + "loss": 0.8012, + "step": 2748 + }, + { + "epoch": 1.397115812210152, + "grad_norm": 3.196671962738037, + "learning_rate": 8.62790361733595e-06, + "loss": 1.0199, + "step": 2749 + }, + { + "epoch": 1.397624039133473, + "grad_norm": 3.023580312728882, + "learning_rate": 8.626746478552364e-06, + "loss": 0.8694, + "step": 2750 + }, + { + "epoch": 1.3981322660567943, + "grad_norm": 3.1226820945739746, + "learning_rate": 8.625588929711769e-06, + "loss": 0.8368, + "step": 2751 + }, + { + "epoch": 1.3986404929801157, + "grad_norm": 3.6180248260498047, + "learning_rate": 8.624430970945042e-06, + "loss": 0.8729, + "step": 2752 + }, + { + "epoch": 1.399148719903437, + "grad_norm": 3.0566389560699463, + "learning_rate": 8.623272602383104e-06, + "loss": 0.8592, + "step": 2753 + }, + { + "epoch": 1.3996569468267581, + "grad_norm": 2.938758373260498, + "learning_rate": 8.622113824156927e-06, + "loss": 0.7979, + "step": 2754 + }, + { + "epoch": 1.4001651737500795, + "grad_norm": 3.0424911975860596, + "learning_rate": 8.62095463639753e-06, + "loss": 0.8087, + "step": 2755 + }, + { + "epoch": 1.4006734006734007, + "grad_norm": 3.3442065715789795, + "learning_rate": 8.619795039235977e-06, + "loss": 0.8459, + "step": 2756 + }, + { + "epoch": 1.401181627596722, + "grad_norm": 3.2160093784332275, + "learning_rate": 8.618635032803373e-06, + "loss": 0.9036, + "step": 2757 + }, + { + "epoch": 1.4016898545200431, + "grad_norm": 3.39898681640625, + "learning_rate": 8.617474617230876e-06, + "loss": 0.9047, + "step": 2758 + }, + { + "epoch": 1.4021980814433643, + "grad_norm": 2.9836056232452393, + "learning_rate": 8.61631379264969e-06, + "loss": 0.8554, + "step": 2759 + }, + { + "epoch": 1.4027063083666858, + "grad_norm": 3.0101606845855713, + "learning_rate": 8.61515255919106e-06, + "loss": 0.8432, + "step": 2760 + }, + { + "epoch": 1.403214535290007, + "grad_norm": 3.043668270111084, + "learning_rate": 8.613990916986283e-06, + "loss": 0.8153, + "step": 2761 + }, + { + "epoch": 1.4037227622133281, + "grad_norm": 3.441566228866577, + "learning_rate": 8.6128288661667e-06, + "loss": 0.9139, + "step": 2762 + }, + { + "epoch": 1.4042309891366496, + "grad_norm": 3.1094048023223877, + "learning_rate": 8.611666406863695e-06, + "loss": 0.8962, + "step": 2763 + }, + { + "epoch": 1.4047392160599708, + "grad_norm": 3.3947198390960693, + "learning_rate": 8.610503539208704e-06, + "loss": 0.8963, + "step": 2764 + }, + { + "epoch": 1.405247442983292, + "grad_norm": 3.0119621753692627, + "learning_rate": 8.609340263333204e-06, + "loss": 0.7885, + "step": 2765 + }, + { + "epoch": 1.4057556699066134, + "grad_norm": 3.0325357913970947, + "learning_rate": 8.608176579368721e-06, + "loss": 0.8552, + "step": 2766 + }, + { + "epoch": 1.4062638968299346, + "grad_norm": 3.492356300354004, + "learning_rate": 8.60701248744683e-06, + "loss": 0.8615, + "step": 2767 + }, + { + "epoch": 1.4067721237532558, + "grad_norm": 3.209897756576538, + "learning_rate": 8.605847987699143e-06, + "loss": 0.8475, + "step": 2768 + }, + { + "epoch": 1.4072803506765772, + "grad_norm": 3.118128538131714, + "learning_rate": 8.604683080257328e-06, + "loss": 0.8113, + "step": 2769 + }, + { + "epoch": 1.4077885775998984, + "grad_norm": 3.1163711547851562, + "learning_rate": 8.603517765253093e-06, + "loss": 0.9601, + "step": 2770 + }, + { + "epoch": 1.4082968045232196, + "grad_norm": 3.1078336238861084, + "learning_rate": 8.602352042818196e-06, + "loss": 0.7957, + "step": 2771 + }, + { + "epoch": 1.408805031446541, + "grad_norm": 3.149662494659424, + "learning_rate": 8.601185913084435e-06, + "loss": 0.8792, + "step": 2772 + }, + { + "epoch": 1.4093132583698622, + "grad_norm": 2.814724922180176, + "learning_rate": 8.600019376183664e-06, + "loss": 0.8117, + "step": 2773 + }, + { + "epoch": 1.4098214852931834, + "grad_norm": 3.325305938720703, + "learning_rate": 8.598852432247773e-06, + "loss": 0.9079, + "step": 2774 + }, + { + "epoch": 1.4103297122165046, + "grad_norm": 3.1834630966186523, + "learning_rate": 8.597685081408702e-06, + "loss": 0.7996, + "step": 2775 + }, + { + "epoch": 1.4108379391398258, + "grad_norm": 3.0160608291625977, + "learning_rate": 8.596517323798439e-06, + "loss": 0.8563, + "step": 2776 + }, + { + "epoch": 1.4113461660631472, + "grad_norm": 3.034503936767578, + "learning_rate": 8.595349159549014e-06, + "loss": 0.8282, + "step": 2777 + }, + { + "epoch": 1.4118543929864684, + "grad_norm": 3.2270278930664062, + "learning_rate": 8.594180588792509e-06, + "loss": 0.8111, + "step": 2778 + }, + { + "epoch": 1.4123626199097896, + "grad_norm": 3.277219772338867, + "learning_rate": 8.593011611661044e-06, + "loss": 0.7967, + "step": 2779 + }, + { + "epoch": 1.412870846833111, + "grad_norm": 3.335444211959839, + "learning_rate": 8.59184222828679e-06, + "loss": 0.8529, + "step": 2780 + }, + { + "epoch": 1.4133790737564322, + "grad_norm": 3.420228958129883, + "learning_rate": 8.590672438801966e-06, + "loss": 0.9701, + "step": 2781 + }, + { + "epoch": 1.4138873006797534, + "grad_norm": 3.2469561100006104, + "learning_rate": 8.58950224333883e-06, + "loss": 0.8626, + "step": 2782 + }, + { + "epoch": 1.4143955276030749, + "grad_norm": 3.1776680946350098, + "learning_rate": 8.588331642029693e-06, + "loss": 0.9284, + "step": 2783 + }, + { + "epoch": 1.414903754526396, + "grad_norm": 3.105638027191162, + "learning_rate": 8.587160635006906e-06, + "loss": 0.8902, + "step": 2784 + }, + { + "epoch": 1.4154119814497172, + "grad_norm": 3.259697675704956, + "learning_rate": 8.585989222402871e-06, + "loss": 0.814, + "step": 2785 + }, + { + "epoch": 1.4159202083730387, + "grad_norm": 2.953216791152954, + "learning_rate": 8.58481740435003e-06, + "loss": 0.7898, + "step": 2786 + }, + { + "epoch": 1.4164284352963599, + "grad_norm": 3.1166532039642334, + "learning_rate": 8.583645180980878e-06, + "loss": 0.7499, + "step": 2787 + }, + { + "epoch": 1.416936662219681, + "grad_norm": 3.0191895961761475, + "learning_rate": 8.582472552427949e-06, + "loss": 0.7992, + "step": 2788 + }, + { + "epoch": 1.4174448891430025, + "grad_norm": 3.2020316123962402, + "learning_rate": 8.581299518823829e-06, + "loss": 0.7971, + "step": 2789 + }, + { + "epoch": 1.4179531160663237, + "grad_norm": 3.126887083053589, + "learning_rate": 8.580126080301143e-06, + "loss": 0.7992, + "step": 2790 + }, + { + "epoch": 1.4184613429896449, + "grad_norm": 3.4426639080047607, + "learning_rate": 8.578952236992569e-06, + "loss": 0.9443, + "step": 2791 + }, + { + "epoch": 1.418969569912966, + "grad_norm": 3.0545034408569336, + "learning_rate": 8.577777989030826e-06, + "loss": 0.7823, + "step": 2792 + }, + { + "epoch": 1.4194777968362873, + "grad_norm": 3.326939821243286, + "learning_rate": 8.576603336548679e-06, + "loss": 0.8822, + "step": 2793 + }, + { + "epoch": 1.4199860237596087, + "grad_norm": 3.2515408992767334, + "learning_rate": 8.575428279678942e-06, + "loss": 0.9458, + "step": 2794 + }, + { + "epoch": 1.42049425068293, + "grad_norm": 3.2859838008880615, + "learning_rate": 8.574252818554469e-06, + "loss": 0.8204, + "step": 2795 + }, + { + "epoch": 1.421002477606251, + "grad_norm": 3.3892626762390137, + "learning_rate": 8.573076953308164e-06, + "loss": 0.9016, + "step": 2796 + }, + { + "epoch": 1.4215107045295725, + "grad_norm": 3.129750967025757, + "learning_rate": 8.57190068407298e-06, + "loss": 0.7464, + "step": 2797 + }, + { + "epoch": 1.4220189314528937, + "grad_norm": 3.18557071685791, + "learning_rate": 8.570724010981907e-06, + "loss": 0.8757, + "step": 2798 + }, + { + "epoch": 1.422527158376215, + "grad_norm": 3.095346450805664, + "learning_rate": 8.569546934167986e-06, + "loss": 0.7698, + "step": 2799 + }, + { + "epoch": 1.4230353852995363, + "grad_norm": 3.1986424922943115, + "learning_rate": 8.568369453764304e-06, + "loss": 0.8281, + "step": 2800 + }, + { + "epoch": 1.4235436122228575, + "grad_norm": 3.0349645614624023, + "learning_rate": 8.567191569903993e-06, + "loss": 0.8225, + "step": 2801 + }, + { + "epoch": 1.4240518391461787, + "grad_norm": 3.03617000579834, + "learning_rate": 8.566013282720227e-06, + "loss": 0.8585, + "step": 2802 + }, + { + "epoch": 1.4245600660695001, + "grad_norm": 2.9680211544036865, + "learning_rate": 8.564834592346235e-06, + "loss": 0.7789, + "step": 2803 + }, + { + "epoch": 1.4250682929928213, + "grad_norm": 2.939490795135498, + "learning_rate": 8.563655498915277e-06, + "loss": 0.8843, + "step": 2804 + }, + { + "epoch": 1.4255765199161425, + "grad_norm": 3.2486467361450195, + "learning_rate": 8.562476002560671e-06, + "loss": 0.8049, + "step": 2805 + }, + { + "epoch": 1.426084746839464, + "grad_norm": 2.8949148654937744, + "learning_rate": 8.561296103415777e-06, + "loss": 0.7904, + "step": 2806 + }, + { + "epoch": 1.4265929737627852, + "grad_norm": 3.06335711479187, + "learning_rate": 8.560115801614e-06, + "loss": 0.8296, + "step": 2807 + }, + { + "epoch": 1.4271012006861064, + "grad_norm": 3.0824975967407227, + "learning_rate": 8.55893509728879e-06, + "loss": 0.8573, + "step": 2808 + }, + { + "epoch": 1.4276094276094276, + "grad_norm": 3.0061516761779785, + "learning_rate": 8.557753990573642e-06, + "loss": 0.7923, + "step": 2809 + }, + { + "epoch": 1.4281176545327487, + "grad_norm": 3.269150495529175, + "learning_rate": 8.556572481602097e-06, + "loss": 0.939, + "step": 2810 + }, + { + "epoch": 1.4286258814560702, + "grad_norm": 3.064577102661133, + "learning_rate": 8.555390570507746e-06, + "loss": 0.8354, + "step": 2811 + }, + { + "epoch": 1.4291341083793914, + "grad_norm": 3.408207416534424, + "learning_rate": 8.554208257424216e-06, + "loss": 0.861, + "step": 2812 + }, + { + "epoch": 1.4296423353027126, + "grad_norm": 3.1423888206481934, + "learning_rate": 8.553025542485188e-06, + "loss": 0.8399, + "step": 2813 + }, + { + "epoch": 1.430150562226034, + "grad_norm": 3.00049090385437, + "learning_rate": 8.551842425824386e-06, + "loss": 0.8831, + "step": 2814 + }, + { + "epoch": 1.4306587891493552, + "grad_norm": 3.9325108528137207, + "learning_rate": 8.550658907575575e-06, + "loss": 0.871, + "step": 2815 + }, + { + "epoch": 1.4311670160726764, + "grad_norm": 3.3278439044952393, + "learning_rate": 8.549474987872575e-06, + "loss": 0.8385, + "step": 2816 + }, + { + "epoch": 1.4316752429959978, + "grad_norm": 3.1003921031951904, + "learning_rate": 8.54829066684924e-06, + "loss": 0.7442, + "step": 2817 + }, + { + "epoch": 1.432183469919319, + "grad_norm": 3.381220579147339, + "learning_rate": 8.547105944639476e-06, + "loss": 0.8432, + "step": 2818 + }, + { + "epoch": 1.4326916968426402, + "grad_norm": 3.1350619792938232, + "learning_rate": 8.545920821377236e-06, + "loss": 0.8929, + "step": 2819 + }, + { + "epoch": 1.4331999237659616, + "grad_norm": 3.075319766998291, + "learning_rate": 8.544735297196514e-06, + "loss": 0.8004, + "step": 2820 + }, + { + "epoch": 1.4337081506892828, + "grad_norm": 3.096254348754883, + "learning_rate": 8.54354937223135e-06, + "loss": 0.8188, + "step": 2821 + }, + { + "epoch": 1.434216377612604, + "grad_norm": 3.446495532989502, + "learning_rate": 8.542363046615832e-06, + "loss": 0.8236, + "step": 2822 + }, + { + "epoch": 1.4347246045359252, + "grad_norm": 3.2281386852264404, + "learning_rate": 8.54117632048409e-06, + "loss": 0.8753, + "step": 2823 + }, + { + "epoch": 1.4352328314592466, + "grad_norm": 3.3451106548309326, + "learning_rate": 8.539989193970302e-06, + "loss": 0.8476, + "step": 2824 + }, + { + "epoch": 1.4357410583825678, + "grad_norm": 3.919847011566162, + "learning_rate": 8.538801667208689e-06, + "loss": 0.8938, + "step": 2825 + }, + { + "epoch": 1.436249285305889, + "grad_norm": 3.22807240486145, + "learning_rate": 8.53761374033352e-06, + "loss": 0.8215, + "step": 2826 + }, + { + "epoch": 1.4367575122292102, + "grad_norm": 3.2741971015930176, + "learning_rate": 8.536425413479106e-06, + "loss": 0.9306, + "step": 2827 + }, + { + "epoch": 1.4372657391525316, + "grad_norm": 3.3959178924560547, + "learning_rate": 8.535236686779803e-06, + "loss": 0.8611, + "step": 2828 + }, + { + "epoch": 1.4377739660758528, + "grad_norm": 3.349571943283081, + "learning_rate": 8.53404756037002e-06, + "loss": 0.8705, + "step": 2829 + }, + { + "epoch": 1.438282192999174, + "grad_norm": 3.0857625007629395, + "learning_rate": 8.5328580343842e-06, + "loss": 0.8817, + "step": 2830 + }, + { + "epoch": 1.4387904199224955, + "grad_norm": 3.328871965408325, + "learning_rate": 8.531668108956839e-06, + "loss": 0.8801, + "step": 2831 + }, + { + "epoch": 1.4392986468458167, + "grad_norm": 3.0159804821014404, + "learning_rate": 8.530477784222474e-06, + "loss": 0.8405, + "step": 2832 + }, + { + "epoch": 1.4398068737691379, + "grad_norm": 3.806766986846924, + "learning_rate": 8.529287060315689e-06, + "loss": 0.7828, + "step": 2833 + }, + { + "epoch": 1.4403151006924593, + "grad_norm": 3.1105751991271973, + "learning_rate": 8.528095937371114e-06, + "loss": 0.8531, + "step": 2834 + }, + { + "epoch": 1.4408233276157805, + "grad_norm": 3.2140769958496094, + "learning_rate": 8.52690441552342e-06, + "loss": 0.9142, + "step": 2835 + }, + { + "epoch": 1.4413315545391017, + "grad_norm": 3.303377151489258, + "learning_rate": 8.525712494907331e-06, + "loss": 0.8428, + "step": 2836 + }, + { + "epoch": 1.441839781462423, + "grad_norm": 3.3976967334747314, + "learning_rate": 8.524520175657607e-06, + "loss": 0.9415, + "step": 2837 + }, + { + "epoch": 1.4423480083857443, + "grad_norm": 3.5745909214019775, + "learning_rate": 8.52332745790906e-06, + "loss": 0.8693, + "step": 2838 + }, + { + "epoch": 1.4428562353090655, + "grad_norm": 3.0088138580322266, + "learning_rate": 8.522134341796541e-06, + "loss": 0.7789, + "step": 2839 + }, + { + "epoch": 1.4433644622323867, + "grad_norm": 3.2750589847564697, + "learning_rate": 8.52094082745495e-06, + "loss": 0.8578, + "step": 2840 + }, + { + "epoch": 1.443872689155708, + "grad_norm": 3.0049092769622803, + "learning_rate": 8.519746915019235e-06, + "loss": 0.8935, + "step": 2841 + }, + { + "epoch": 1.4443809160790293, + "grad_norm": 3.0418643951416016, + "learning_rate": 8.518552604624383e-06, + "loss": 0.8245, + "step": 2842 + }, + { + "epoch": 1.4448891430023505, + "grad_norm": 3.2596395015716553, + "learning_rate": 8.517357896405427e-06, + "loss": 0.8868, + "step": 2843 + }, + { + "epoch": 1.4453973699256717, + "grad_norm": 2.954144239425659, + "learning_rate": 8.516162790497448e-06, + "loss": 0.8098, + "step": 2844 + }, + { + "epoch": 1.4459055968489931, + "grad_norm": 3.078198194503784, + "learning_rate": 8.51496728703557e-06, + "loss": 0.9043, + "step": 2845 + }, + { + "epoch": 1.4464138237723143, + "grad_norm": 3.0612032413482666, + "learning_rate": 8.51377138615496e-06, + "loss": 0.7907, + "step": 2846 + }, + { + "epoch": 1.4469220506956355, + "grad_norm": 3.0762479305267334, + "learning_rate": 8.512575087990838e-06, + "loss": 0.8781, + "step": 2847 + }, + { + "epoch": 1.447430277618957, + "grad_norm": 3.2731642723083496, + "learning_rate": 8.511378392678456e-06, + "loss": 0.8208, + "step": 2848 + }, + { + "epoch": 1.4479385045422781, + "grad_norm": 2.9340736865997314, + "learning_rate": 8.510181300353123e-06, + "loss": 0.7683, + "step": 2849 + }, + { + "epoch": 1.4484467314655993, + "grad_norm": 3.1629176139831543, + "learning_rate": 8.508983811150187e-06, + "loss": 0.8628, + "step": 2850 + }, + { + "epoch": 1.4489549583889207, + "grad_norm": 3.1435041427612305, + "learning_rate": 8.50778592520504e-06, + "loss": 0.8533, + "step": 2851 + }, + { + "epoch": 1.449463185312242, + "grad_norm": 3.251697063446045, + "learning_rate": 8.506587642653122e-06, + "loss": 0.8611, + "step": 2852 + }, + { + "epoch": 1.4499714122355631, + "grad_norm": 3.0637731552124023, + "learning_rate": 8.505388963629914e-06, + "loss": 0.7843, + "step": 2853 + }, + { + "epoch": 1.4504796391588846, + "grad_norm": 3.6621084213256836, + "learning_rate": 8.504189888270948e-06, + "loss": 0.8674, + "step": 2854 + }, + { + "epoch": 1.4509878660822058, + "grad_norm": 3.443359851837158, + "learning_rate": 8.502990416711796e-06, + "loss": 0.778, + "step": 2855 + }, + { + "epoch": 1.451496093005527, + "grad_norm": 3.2870068550109863, + "learning_rate": 8.501790549088074e-06, + "loss": 0.8024, + "step": 2856 + }, + { + "epoch": 1.4520043199288482, + "grad_norm": 3.1077282428741455, + "learning_rate": 8.500590285535447e-06, + "loss": 0.8335, + "step": 2857 + }, + { + "epoch": 1.4525125468521696, + "grad_norm": 3.2536587715148926, + "learning_rate": 8.499389626189622e-06, + "loss": 0.8781, + "step": 2858 + }, + { + "epoch": 1.4530207737754908, + "grad_norm": 3.109429359436035, + "learning_rate": 8.49818857118635e-06, + "loss": 0.8489, + "step": 2859 + }, + { + "epoch": 1.453529000698812, + "grad_norm": 3.064183235168457, + "learning_rate": 8.496987120661429e-06, + "loss": 0.8095, + "step": 2860 + }, + { + "epoch": 1.4540372276221332, + "grad_norm": 3.017165422439575, + "learning_rate": 8.495785274750698e-06, + "loss": 0.8582, + "step": 2861 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 3.174152374267578, + "learning_rate": 8.494583033590047e-06, + "loss": 0.7484, + "step": 2862 + }, + { + "epoch": 1.4550536814687758, + "grad_norm": 3.0165398120880127, + "learning_rate": 8.493380397315408e-06, + "loss": 0.8425, + "step": 2863 + }, + { + "epoch": 1.455561908392097, + "grad_norm": 3.5248165130615234, + "learning_rate": 8.49217736606275e-06, + "loss": 0.83, + "step": 2864 + }, + { + "epoch": 1.4560701353154184, + "grad_norm": 3.3429296016693115, + "learning_rate": 8.490973939968101e-06, + "loss": 0.8659, + "step": 2865 + }, + { + "epoch": 1.4565783622387396, + "grad_norm": 3.2521004676818848, + "learning_rate": 8.489770119167521e-06, + "loss": 0.8644, + "step": 2866 + }, + { + "epoch": 1.4570865891620608, + "grad_norm": 3.1303560733795166, + "learning_rate": 8.488565903797122e-06, + "loss": 0.9001, + "step": 2867 + }, + { + "epoch": 1.4575948160853822, + "grad_norm": 2.9541337490081787, + "learning_rate": 8.487361293993057e-06, + "loss": 0.8452, + "step": 2868 + }, + { + "epoch": 1.4581030430087034, + "grad_norm": 2.9469094276428223, + "learning_rate": 8.486156289891527e-06, + "loss": 0.804, + "step": 2869 + }, + { + "epoch": 1.4586112699320246, + "grad_norm": 3.3827242851257324, + "learning_rate": 8.484950891628774e-06, + "loss": 0.8085, + "step": 2870 + }, + { + "epoch": 1.459119496855346, + "grad_norm": 3.1991117000579834, + "learning_rate": 8.483745099341082e-06, + "loss": 0.8154, + "step": 2871 + }, + { + "epoch": 1.4596277237786672, + "grad_norm": 3.126009941101074, + "learning_rate": 8.482538913164792e-06, + "loss": 0.8419, + "step": 2872 + }, + { + "epoch": 1.4601359507019884, + "grad_norm": 3.3102211952209473, + "learning_rate": 8.481332333236275e-06, + "loss": 0.8628, + "step": 2873 + }, + { + "epoch": 1.4606441776253096, + "grad_norm": 3.188005208969116, + "learning_rate": 8.480125359691954e-06, + "loss": 0.9521, + "step": 2874 + }, + { + "epoch": 1.461152404548631, + "grad_norm": 3.1601901054382324, + "learning_rate": 8.478917992668295e-06, + "loss": 0.7734, + "step": 2875 + }, + { + "epoch": 1.4616606314719522, + "grad_norm": 3.1462960243225098, + "learning_rate": 8.477710232301809e-06, + "loss": 0.8857, + "step": 2876 + }, + { + "epoch": 1.4621688583952734, + "grad_norm": 3.0840206146240234, + "learning_rate": 8.476502078729049e-06, + "loss": 0.8253, + "step": 2877 + }, + { + "epoch": 1.4626770853185946, + "grad_norm": 3.2918813228607178, + "learning_rate": 8.47529353208662e-06, + "loss": 0.7815, + "step": 2878 + }, + { + "epoch": 1.463185312241916, + "grad_norm": 3.0587096214294434, + "learning_rate": 8.47408459251116e-06, + "loss": 0.8291, + "step": 2879 + }, + { + "epoch": 1.4636935391652373, + "grad_norm": 2.9685184955596924, + "learning_rate": 8.472875260139361e-06, + "loss": 0.8308, + "step": 2880 + }, + { + "epoch": 1.4642017660885585, + "grad_norm": 3.0110650062561035, + "learning_rate": 8.471665535107953e-06, + "loss": 0.8293, + "step": 2881 + }, + { + "epoch": 1.4647099930118799, + "grad_norm": 3.130685329437256, + "learning_rate": 8.470455417553716e-06, + "loss": 0.8487, + "step": 2882 + }, + { + "epoch": 1.465218219935201, + "grad_norm": 3.396280527114868, + "learning_rate": 8.46924490761347e-06, + "loss": 0.9272, + "step": 2883 + }, + { + "epoch": 1.4657264468585223, + "grad_norm": 3.0790679454803467, + "learning_rate": 8.468034005424081e-06, + "loss": 0.8587, + "step": 2884 + }, + { + "epoch": 1.4662346737818437, + "grad_norm": 3.0198047161102295, + "learning_rate": 8.46682271112246e-06, + "loss": 0.8687, + "step": 2885 + }, + { + "epoch": 1.4667429007051649, + "grad_norm": 3.0898425579071045, + "learning_rate": 8.465611024845561e-06, + "loss": 0.8936, + "step": 2886 + }, + { + "epoch": 1.467251127628486, + "grad_norm": 3.215315818786621, + "learning_rate": 8.464398946730383e-06, + "loss": 0.8631, + "step": 2887 + }, + { + "epoch": 1.4677593545518075, + "grad_norm": 3.161775827407837, + "learning_rate": 8.46318647691397e-06, + "loss": 0.8432, + "step": 2888 + }, + { + "epoch": 1.4682675814751287, + "grad_norm": 3.053117513656616, + "learning_rate": 8.461973615533409e-06, + "loss": 0.9322, + "step": 2889 + }, + { + "epoch": 1.46877580839845, + "grad_norm": 3.3006246089935303, + "learning_rate": 8.460760362725831e-06, + "loss": 0.8339, + "step": 2890 + }, + { + "epoch": 1.469284035321771, + "grad_norm": 3.0707836151123047, + "learning_rate": 8.459546718628412e-06, + "loss": 0.8493, + "step": 2891 + }, + { + "epoch": 1.4697922622450923, + "grad_norm": 3.0935218334198, + "learning_rate": 8.458332683378375e-06, + "loss": 0.8258, + "step": 2892 + }, + { + "epoch": 1.4703004891684137, + "grad_norm": 3.4484004974365234, + "learning_rate": 8.457118257112982e-06, + "loss": 0.8924, + "step": 2893 + }, + { + "epoch": 1.470808716091735, + "grad_norm": 3.459404706954956, + "learning_rate": 8.455903439969543e-06, + "loss": 0.8267, + "step": 2894 + }, + { + "epoch": 1.4713169430150561, + "grad_norm": 3.255765914916992, + "learning_rate": 8.454688232085409e-06, + "loss": 0.9236, + "step": 2895 + }, + { + "epoch": 1.4718251699383775, + "grad_norm": 3.0659914016723633, + "learning_rate": 8.45347263359798e-06, + "loss": 0.8843, + "step": 2896 + }, + { + "epoch": 1.4723333968616987, + "grad_norm": 2.9841461181640625, + "learning_rate": 8.452256644644694e-06, + "loss": 0.7879, + "step": 2897 + }, + { + "epoch": 1.47284162378502, + "grad_norm": 3.225430488586426, + "learning_rate": 8.451040265363039e-06, + "loss": 0.8594, + "step": 2898 + }, + { + "epoch": 1.4733498507083413, + "grad_norm": 3.0873258113861084, + "learning_rate": 8.449823495890546e-06, + "loss": 0.8681, + "step": 2899 + }, + { + "epoch": 1.4738580776316625, + "grad_norm": 2.978499174118042, + "learning_rate": 8.448606336364783e-06, + "loss": 0.8227, + "step": 2900 + }, + { + "epoch": 1.4743663045549837, + "grad_norm": 3.4347798824310303, + "learning_rate": 8.447388786923371e-06, + "loss": 0.9436, + "step": 2901 + }, + { + "epoch": 1.4748745314783052, + "grad_norm": 3.1734769344329834, + "learning_rate": 8.446170847703975e-06, + "loss": 0.8, + "step": 2902 + }, + { + "epoch": 1.4753827584016264, + "grad_norm": 2.9005730152130127, + "learning_rate": 8.444952518844297e-06, + "loss": 0.879, + "step": 2903 + }, + { + "epoch": 1.4758909853249476, + "grad_norm": 3.3382294178009033, + "learning_rate": 8.443733800482089e-06, + "loss": 0.9734, + "step": 2904 + }, + { + "epoch": 1.476399212248269, + "grad_norm": 2.981613874435425, + "learning_rate": 8.442514692755141e-06, + "loss": 0.9232, + "step": 2905 + }, + { + "epoch": 1.4769074391715902, + "grad_norm": 3.060418128967285, + "learning_rate": 8.441295195801296e-06, + "loss": 0.8169, + "step": 2906 + }, + { + "epoch": 1.4774156660949114, + "grad_norm": 3.258392095565796, + "learning_rate": 8.440075309758433e-06, + "loss": 0.7951, + "step": 2907 + }, + { + "epoch": 1.4779238930182326, + "grad_norm": 3.1214146614074707, + "learning_rate": 8.438855034764482e-06, + "loss": 0.8439, + "step": 2908 + }, + { + "epoch": 1.4784321199415538, + "grad_norm": 3.0851261615753174, + "learning_rate": 8.437634370957407e-06, + "loss": 0.9226, + "step": 2909 + }, + { + "epoch": 1.4789403468648752, + "grad_norm": 3.002401351928711, + "learning_rate": 8.436413318475227e-06, + "loss": 0.7845, + "step": 2910 + }, + { + "epoch": 1.4794485737881964, + "grad_norm": 2.99877667427063, + "learning_rate": 8.435191877455998e-06, + "loss": 0.8346, + "step": 2911 + }, + { + "epoch": 1.4799568007115176, + "grad_norm": 3.067758321762085, + "learning_rate": 8.43397004803782e-06, + "loss": 0.8056, + "step": 2912 + }, + { + "epoch": 1.480465027634839, + "grad_norm": 3.270920515060425, + "learning_rate": 8.432747830358843e-06, + "loss": 0.8406, + "step": 2913 + }, + { + "epoch": 1.4809732545581602, + "grad_norm": 3.130580186843872, + "learning_rate": 8.431525224557252e-06, + "loss": 0.8509, + "step": 2914 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 3.3330612182617188, + "learning_rate": 8.430302230771287e-06, + "loss": 0.8677, + "step": 2915 + }, + { + "epoch": 1.4819897084048028, + "grad_norm": 3.016632318496704, + "learning_rate": 8.42907884913922e-06, + "loss": 0.7927, + "step": 2916 + }, + { + "epoch": 1.482497935328124, + "grad_norm": 3.3111484050750732, + "learning_rate": 8.427855079799372e-06, + "loss": 0.8822, + "step": 2917 + }, + { + "epoch": 1.4830061622514452, + "grad_norm": 3.247408628463745, + "learning_rate": 8.426630922890111e-06, + "loss": 0.905, + "step": 2918 + }, + { + "epoch": 1.4835143891747666, + "grad_norm": 2.9573397636413574, + "learning_rate": 8.425406378549845e-06, + "loss": 0.8445, + "step": 2919 + }, + { + "epoch": 1.4840226160980878, + "grad_norm": 3.0608110427856445, + "learning_rate": 8.424181446917025e-06, + "loss": 0.7899, + "step": 2920 + }, + { + "epoch": 1.484530843021409, + "grad_norm": 3.070166826248169, + "learning_rate": 8.422956128130152e-06, + "loss": 0.8312, + "step": 2921 + }, + { + "epoch": 1.4850390699447304, + "grad_norm": 3.365817070007324, + "learning_rate": 8.421730422327761e-06, + "loss": 0.8399, + "step": 2922 + }, + { + "epoch": 1.4855472968680516, + "grad_norm": 3.1153318881988525, + "learning_rate": 8.42050432964844e-06, + "loss": 0.8013, + "step": 2923 + }, + { + "epoch": 1.4860555237913728, + "grad_norm": 3.2523930072784424, + "learning_rate": 8.419277850230813e-06, + "loss": 0.8811, + "step": 2924 + }, + { + "epoch": 1.486563750714694, + "grad_norm": 3.05375599861145, + "learning_rate": 8.418050984213556e-06, + "loss": 0.882, + "step": 2925 + }, + { + "epoch": 1.4870719776380152, + "grad_norm": 3.3024351596832275, + "learning_rate": 8.41682373173538e-06, + "loss": 0.9168, + "step": 2926 + }, + { + "epoch": 1.4875802045613367, + "grad_norm": 3.0616862773895264, + "learning_rate": 8.415596092935047e-06, + "loss": 0.841, + "step": 2927 + }, + { + "epoch": 1.4880884314846579, + "grad_norm": 3.1600990295410156, + "learning_rate": 8.41436806795136e-06, + "loss": 0.8187, + "step": 2928 + }, + { + "epoch": 1.488596658407979, + "grad_norm": 3.2013626098632812, + "learning_rate": 8.413139656923162e-06, + "loss": 0.8933, + "step": 2929 + }, + { + "epoch": 1.4891048853313005, + "grad_norm": 3.221249580383301, + "learning_rate": 8.411910859989345e-06, + "loss": 0.8945, + "step": 2930 + }, + { + "epoch": 1.4896131122546217, + "grad_norm": 3.0507285594940186, + "learning_rate": 8.410681677288843e-06, + "loss": 0.934, + "step": 2931 + }, + { + "epoch": 1.4901213391779429, + "grad_norm": 3.444394111633301, + "learning_rate": 8.409452108960631e-06, + "loss": 0.8934, + "step": 2932 + }, + { + "epoch": 1.4906295661012643, + "grad_norm": 3.080002546310425, + "learning_rate": 8.408222155143732e-06, + "loss": 0.7693, + "step": 2933 + }, + { + "epoch": 1.4911377930245855, + "grad_norm": 3.0022099018096924, + "learning_rate": 8.40699181597721e-06, + "loss": 0.8172, + "step": 2934 + }, + { + "epoch": 1.4916460199479067, + "grad_norm": 2.9647133350372314, + "learning_rate": 8.405761091600172e-06, + "loss": 0.9459, + "step": 2935 + }, + { + "epoch": 1.492154246871228, + "grad_norm": 2.958550453186035, + "learning_rate": 8.404529982151772e-06, + "loss": 0.8155, + "step": 2936 + }, + { + "epoch": 1.4926624737945493, + "grad_norm": 2.8132691383361816, + "learning_rate": 8.403298487771201e-06, + "loss": 0.7531, + "step": 2937 + }, + { + "epoch": 1.4931707007178705, + "grad_norm": 3.3202908039093018, + "learning_rate": 8.4020666085977e-06, + "loss": 0.9386, + "step": 2938 + }, + { + "epoch": 1.493678927641192, + "grad_norm": 3.345435857772827, + "learning_rate": 8.40083434477055e-06, + "loss": 0.9833, + "step": 2939 + }, + { + "epoch": 1.4941871545645131, + "grad_norm": 3.2024502754211426, + "learning_rate": 8.399601696429077e-06, + "loss": 0.8559, + "step": 2940 + }, + { + "epoch": 1.4946953814878343, + "grad_norm": 3.3189926147460938, + "learning_rate": 8.398368663712652e-06, + "loss": 0.8808, + "step": 2941 + }, + { + "epoch": 1.4952036084111555, + "grad_norm": 3.0005111694335938, + "learning_rate": 8.397135246760686e-06, + "loss": 0.8676, + "step": 2942 + }, + { + "epoch": 1.4957118353344767, + "grad_norm": 2.9679107666015625, + "learning_rate": 8.395901445712635e-06, + "loss": 0.7782, + "step": 2943 + }, + { + "epoch": 1.4962200622577981, + "grad_norm": 3.023895263671875, + "learning_rate": 8.394667260707996e-06, + "loss": 0.8329, + "step": 2944 + }, + { + "epoch": 1.4967282891811193, + "grad_norm": 2.946505069732666, + "learning_rate": 8.393432691886314e-06, + "loss": 0.7313, + "step": 2945 + }, + { + "epoch": 1.4972365161044405, + "grad_norm": 2.7999486923217773, + "learning_rate": 8.392197739387175e-06, + "loss": 0.8184, + "step": 2946 + }, + { + "epoch": 1.497744743027762, + "grad_norm": 3.1402924060821533, + "learning_rate": 8.390962403350209e-06, + "loss": 0.843, + "step": 2947 + }, + { + "epoch": 1.4982529699510831, + "grad_norm": 3.1389057636260986, + "learning_rate": 8.389726683915088e-06, + "loss": 0.9186, + "step": 2948 + }, + { + "epoch": 1.4987611968744043, + "grad_norm": 2.9966344833374023, + "learning_rate": 8.388490581221529e-06, + "loss": 0.8748, + "step": 2949 + }, + { + "epoch": 1.4992694237977258, + "grad_norm": 3.105550527572632, + "learning_rate": 8.387254095409289e-06, + "loss": 0.8893, + "step": 2950 + }, + { + "epoch": 1.499777650721047, + "grad_norm": 3.089803695678711, + "learning_rate": 8.386017226618175e-06, + "loss": 0.8809, + "step": 2951 + }, + { + "epoch": 1.5002858776443682, + "grad_norm": 3.3688395023345947, + "learning_rate": 8.38477997498803e-06, + "loss": 0.8093, + "step": 2952 + }, + { + "epoch": 1.5007941045676896, + "grad_norm": 3.1366262435913086, + "learning_rate": 8.383542340658749e-06, + "loss": 0.9673, + "step": 2953 + }, + { + "epoch": 1.5013023314910108, + "grad_norm": 3.131044387817383, + "learning_rate": 8.382304323770257e-06, + "loss": 0.9301, + "step": 2954 + }, + { + "epoch": 1.501810558414332, + "grad_norm": 3.0539796352386475, + "learning_rate": 8.381065924462532e-06, + "loss": 0.9085, + "step": 2955 + }, + { + "epoch": 1.5023187853376534, + "grad_norm": 3.356163263320923, + "learning_rate": 8.379827142875598e-06, + "loss": 0.8581, + "step": 2956 + }, + { + "epoch": 1.5028270122609744, + "grad_norm": 3.249194622039795, + "learning_rate": 8.378587979149512e-06, + "loss": 0.8807, + "step": 2957 + }, + { + "epoch": 1.5033352391842958, + "grad_norm": 3.210223913192749, + "learning_rate": 8.377348433424382e-06, + "loss": 0.875, + "step": 2958 + }, + { + "epoch": 1.5038434661076172, + "grad_norm": 2.936296224594116, + "learning_rate": 8.37610850584036e-06, + "loss": 0.7714, + "step": 2959 + }, + { + "epoch": 1.5043516930309382, + "grad_norm": 3.063220262527466, + "learning_rate": 8.374868196537632e-06, + "loss": 0.8493, + "step": 2960 + }, + { + "epoch": 1.5048599199542596, + "grad_norm": 2.9019317626953125, + "learning_rate": 8.373627505656434e-06, + "loss": 0.8043, + "step": 2961 + }, + { + "epoch": 1.5053681468775808, + "grad_norm": 3.295156717300415, + "learning_rate": 8.37238643333705e-06, + "loss": 0.9071, + "step": 2962 + }, + { + "epoch": 1.505876373800902, + "grad_norm": 3.10031795501709, + "learning_rate": 8.371144979719797e-06, + "loss": 0.8211, + "step": 2963 + }, + { + "epoch": 1.5063846007242234, + "grad_norm": 3.311487913131714, + "learning_rate": 8.36990314494504e-06, + "loss": 0.9032, + "step": 2964 + }, + { + "epoch": 1.5068928276475446, + "grad_norm": 3.106748580932617, + "learning_rate": 8.368660929153187e-06, + "loss": 0.8927, + "step": 2965 + }, + { + "epoch": 1.5074010545708658, + "grad_norm": 3.0898537635803223, + "learning_rate": 8.367418332484689e-06, + "loss": 0.8918, + "step": 2966 + }, + { + "epoch": 1.5079092814941872, + "grad_norm": 3.2117109298706055, + "learning_rate": 8.36617535508004e-06, + "loss": 0.8505, + "step": 2967 + }, + { + "epoch": 1.5084175084175084, + "grad_norm": 3.125581979751587, + "learning_rate": 8.364931997079775e-06, + "loss": 0.9883, + "step": 2968 + }, + { + "epoch": 1.5089257353408296, + "grad_norm": 3.275686502456665, + "learning_rate": 8.363688258624478e-06, + "loss": 0.8197, + "step": 2969 + }, + { + "epoch": 1.509433962264151, + "grad_norm": 3.1875977516174316, + "learning_rate": 8.362444139854767e-06, + "loss": 0.8912, + "step": 2970 + }, + { + "epoch": 1.5099421891874723, + "grad_norm": 3.183387279510498, + "learning_rate": 8.361199640911311e-06, + "loss": 0.8201, + "step": 2971 + }, + { + "epoch": 1.5104504161107934, + "grad_norm": 3.1798882484436035, + "learning_rate": 8.35995476193482e-06, + "loss": 0.8789, + "step": 2972 + }, + { + "epoch": 1.5109586430341149, + "grad_norm": 3.138533353805542, + "learning_rate": 8.358709503066042e-06, + "loss": 0.8732, + "step": 2973 + }, + { + "epoch": 1.5114668699574358, + "grad_norm": 3.4618101119995117, + "learning_rate": 8.357463864445774e-06, + "loss": 0.8354, + "step": 2974 + }, + { + "epoch": 1.5119750968807573, + "grad_norm": 3.1000592708587646, + "learning_rate": 8.356217846214855e-06, + "loss": 0.7872, + "step": 2975 + }, + { + "epoch": 1.5124833238040787, + "grad_norm": 3.0090417861938477, + "learning_rate": 8.354971448514164e-06, + "loss": 0.8379, + "step": 2976 + }, + { + "epoch": 1.5129915507273997, + "grad_norm": 2.9547312259674072, + "learning_rate": 8.353724671484624e-06, + "loss": 0.7905, + "step": 2977 + }, + { + "epoch": 1.513499777650721, + "grad_norm": 3.2382640838623047, + "learning_rate": 8.352477515267203e-06, + "loss": 0.8356, + "step": 2978 + }, + { + "epoch": 1.5140080045740423, + "grad_norm": 2.9780771732330322, + "learning_rate": 8.35122998000291e-06, + "loss": 0.8185, + "step": 2979 + }, + { + "epoch": 1.5145162314973635, + "grad_norm": 3.149280309677124, + "learning_rate": 8.349982065832797e-06, + "loss": 0.7817, + "step": 2980 + }, + { + "epoch": 1.515024458420685, + "grad_norm": 3.0026772022247314, + "learning_rate": 8.34873377289796e-06, + "loss": 0.8866, + "step": 2981 + }, + { + "epoch": 1.515532685344006, + "grad_norm": 3.194310188293457, + "learning_rate": 8.347485101339533e-06, + "loss": 0.8655, + "step": 2982 + }, + { + "epoch": 1.5160409122673273, + "grad_norm": 3.2000746726989746, + "learning_rate": 8.3462360512987e-06, + "loss": 0.8921, + "step": 2983 + }, + { + "epoch": 1.5165491391906487, + "grad_norm": 3.400982141494751, + "learning_rate": 8.344986622916685e-06, + "loss": 0.8467, + "step": 2984 + }, + { + "epoch": 1.51705736611397, + "grad_norm": 2.931072235107422, + "learning_rate": 8.343736816334755e-06, + "loss": 0.834, + "step": 2985 + }, + { + "epoch": 1.517565593037291, + "grad_norm": 3.178807497024536, + "learning_rate": 8.342486631694216e-06, + "loss": 0.9266, + "step": 2986 + }, + { + "epoch": 1.5180738199606125, + "grad_norm": 3.3427088260650635, + "learning_rate": 8.341236069136419e-06, + "loss": 0.8043, + "step": 2987 + }, + { + "epoch": 1.5185820468839337, + "grad_norm": 3.239030599594116, + "learning_rate": 8.339985128802763e-06, + "loss": 0.945, + "step": 2988 + }, + { + "epoch": 1.519090273807255, + "grad_norm": 3.4419260025024414, + "learning_rate": 8.33873381083468e-06, + "loss": 0.8667, + "step": 2989 + }, + { + "epoch": 1.5195985007305763, + "grad_norm": 3.0164976119995117, + "learning_rate": 8.337482115373655e-06, + "loss": 0.839, + "step": 2990 + }, + { + "epoch": 1.5201067276538973, + "grad_norm": 2.8095803260803223, + "learning_rate": 8.336230042561209e-06, + "loss": 0.7806, + "step": 2991 + }, + { + "epoch": 1.5206149545772187, + "grad_norm": 3.120523452758789, + "learning_rate": 8.334977592538904e-06, + "loss": 0.8523, + "step": 2992 + }, + { + "epoch": 1.5211231815005402, + "grad_norm": 3.2824933528900146, + "learning_rate": 8.333724765448352e-06, + "loss": 0.8601, + "step": 2993 + }, + { + "epoch": 1.5216314084238611, + "grad_norm": 3.133676767349243, + "learning_rate": 8.3324715614312e-06, + "loss": 0.8269, + "step": 2994 + }, + { + "epoch": 1.5221396353471826, + "grad_norm": 3.2283775806427, + "learning_rate": 8.331217980629144e-06, + "loss": 0.9106, + "step": 2995 + }, + { + "epoch": 1.5226478622705037, + "grad_norm": 3.171283483505249, + "learning_rate": 8.329964023183918e-06, + "loss": 0.8629, + "step": 2996 + }, + { + "epoch": 1.523156089193825, + "grad_norm": 3.0246291160583496, + "learning_rate": 8.328709689237303e-06, + "loss": 0.8226, + "step": 2997 + }, + { + "epoch": 1.5236643161171464, + "grad_norm": 3.2844457626342773, + "learning_rate": 8.327454978931117e-06, + "loss": 0.8238, + "step": 2998 + }, + { + "epoch": 1.5241725430404676, + "grad_norm": 3.1582090854644775, + "learning_rate": 8.326199892407222e-06, + "loss": 0.8133, + "step": 2999 + }, + { + "epoch": 1.5246807699637888, + "grad_norm": 2.971914768218994, + "learning_rate": 8.32494442980753e-06, + "loss": 0.7771, + "step": 3000 + }, + { + "epoch": 1.5246807699637888, + "eval_loss": 1.2642887830734253, + "eval_runtime": 14.481, + "eval_samples_per_second": 27.622, + "eval_steps_per_second": 3.453, + "step": 3000 + }, + { + "epoch": 1.5251889968871102, + "grad_norm": 3.0078349113464355, + "learning_rate": 8.323688591273983e-06, + "loss": 0.8273, + "step": 3001 + }, + { + "epoch": 1.5256972238104314, + "grad_norm": 2.915525436401367, + "learning_rate": 8.322432376948577e-06, + "loss": 0.8111, + "step": 3002 + }, + { + "epoch": 1.5262054507337526, + "grad_norm": 3.245734930038452, + "learning_rate": 8.321175786973343e-06, + "loss": 0.8522, + "step": 3003 + }, + { + "epoch": 1.526713677657074, + "grad_norm": 3.0924389362335205, + "learning_rate": 8.319918821490358e-06, + "loss": 0.9071, + "step": 3004 + }, + { + "epoch": 1.5272219045803952, + "grad_norm": 3.2382094860076904, + "learning_rate": 8.318661480641738e-06, + "loss": 0.7896, + "step": 3005 + }, + { + "epoch": 1.5277301315037164, + "grad_norm": 3.118859052658081, + "learning_rate": 8.317403764569646e-06, + "loss": 0.841, + "step": 3006 + }, + { + "epoch": 1.5282383584270378, + "grad_norm": 3.158026695251465, + "learning_rate": 8.316145673416285e-06, + "loss": 0.862, + "step": 3007 + }, + { + "epoch": 1.5287465853503588, + "grad_norm": 3.2535459995269775, + "learning_rate": 8.3148872073239e-06, + "loss": 0.8305, + "step": 3008 + }, + { + "epoch": 1.5292548122736802, + "grad_norm": 2.9503650665283203, + "learning_rate": 8.31362836643478e-06, + "loss": 0.911, + "step": 3009 + }, + { + "epoch": 1.5297630391970014, + "grad_norm": 3.5011672973632812, + "learning_rate": 8.312369150891256e-06, + "loss": 0.8192, + "step": 3010 + }, + { + "epoch": 1.5302712661203226, + "grad_norm": 3.1151344776153564, + "learning_rate": 8.3111095608357e-06, + "loss": 0.8384, + "step": 3011 + }, + { + "epoch": 1.530779493043644, + "grad_norm": 3.046571731567383, + "learning_rate": 8.309849596410527e-06, + "loss": 0.7742, + "step": 3012 + }, + { + "epoch": 1.5312877199669652, + "grad_norm": 3.1235508918762207, + "learning_rate": 8.308589257758194e-06, + "loss": 0.8431, + "step": 3013 + }, + { + "epoch": 1.5317959468902864, + "grad_norm": 3.450984477996826, + "learning_rate": 8.307328545021203e-06, + "loss": 0.8558, + "step": 3014 + }, + { + "epoch": 1.5323041738136078, + "grad_norm": 3.317640542984009, + "learning_rate": 8.306067458342092e-06, + "loss": 0.7204, + "step": 3015 + }, + { + "epoch": 1.532812400736929, + "grad_norm": 3.245126247406006, + "learning_rate": 8.304805997863453e-06, + "loss": 0.8786, + "step": 3016 + }, + { + "epoch": 1.5333206276602502, + "grad_norm": 3.327097177505493, + "learning_rate": 8.303544163727904e-06, + "loss": 0.8458, + "step": 3017 + }, + { + "epoch": 1.5338288545835717, + "grad_norm": 3.1399662494659424, + "learning_rate": 8.302281956078117e-06, + "loss": 0.7665, + "step": 3018 + }, + { + "epoch": 1.5343370815068929, + "grad_norm": 3.164243698120117, + "learning_rate": 8.301019375056805e-06, + "loss": 0.7948, + "step": 3019 + }, + { + "epoch": 1.534845308430214, + "grad_norm": 3.5101428031921387, + "learning_rate": 8.29975642080672e-06, + "loss": 0.9736, + "step": 3020 + }, + { + "epoch": 1.5353535353535355, + "grad_norm": 3.018258810043335, + "learning_rate": 8.298493093470656e-06, + "loss": 0.8181, + "step": 3021 + }, + { + "epoch": 1.5358617622768567, + "grad_norm": 3.4201853275299072, + "learning_rate": 8.297229393191454e-06, + "loss": 0.8984, + "step": 3022 + }, + { + "epoch": 1.5363699892001779, + "grad_norm": 2.9878666400909424, + "learning_rate": 8.295965320111993e-06, + "loss": 0.8458, + "step": 3023 + }, + { + "epoch": 1.5368782161234993, + "grad_norm": 3.21189022064209, + "learning_rate": 8.294700874375192e-06, + "loss": 0.803, + "step": 3024 + }, + { + "epoch": 1.5373864430468203, + "grad_norm": 3.2621307373046875, + "learning_rate": 8.29343605612402e-06, + "loss": 0.9049, + "step": 3025 + }, + { + "epoch": 1.5378946699701417, + "grad_norm": 3.1909806728363037, + "learning_rate": 8.292170865501479e-06, + "loss": 0.9027, + "step": 3026 + }, + { + "epoch": 1.5384028968934629, + "grad_norm": 2.886561870574951, + "learning_rate": 8.29090530265062e-06, + "loss": 0.8333, + "step": 3027 + }, + { + "epoch": 1.538911123816784, + "grad_norm": 3.039076566696167, + "learning_rate": 8.28963936771453e-06, + "loss": 0.8114, + "step": 3028 + }, + { + "epoch": 1.5394193507401055, + "grad_norm": 3.1542789936065674, + "learning_rate": 8.288373060836347e-06, + "loss": 0.8028, + "step": 3029 + }, + { + "epoch": 1.5399275776634267, + "grad_norm": 3.1072874069213867, + "learning_rate": 8.287106382159242e-06, + "loss": 0.8745, + "step": 3030 + }, + { + "epoch": 1.540435804586748, + "grad_norm": 3.9167263507843018, + "learning_rate": 8.285839331826432e-06, + "loss": 0.9285, + "step": 3031 + }, + { + "epoch": 1.5409440315100693, + "grad_norm": 3.416506290435791, + "learning_rate": 8.28457190998118e-06, + "loss": 0.9308, + "step": 3032 + }, + { + "epoch": 1.5414522584333905, + "grad_norm": 3.403721332550049, + "learning_rate": 8.283304116766777e-06, + "loss": 0.8827, + "step": 3033 + }, + { + "epoch": 1.5419604853567117, + "grad_norm": 2.909219264984131, + "learning_rate": 8.282035952326575e-06, + "loss": 0.7463, + "step": 3034 + }, + { + "epoch": 1.5424687122800331, + "grad_norm": 3.1260173320770264, + "learning_rate": 8.280767416803953e-06, + "loss": 0.8301, + "step": 3035 + }, + { + "epoch": 1.5429769392033543, + "grad_norm": 3.044611692428589, + "learning_rate": 8.27949851034234e-06, + "loss": 0.8554, + "step": 3036 + }, + { + "epoch": 1.5434851661266755, + "grad_norm": 3.3264572620391846, + "learning_rate": 8.278229233085206e-06, + "loss": 0.9276, + "step": 3037 + }, + { + "epoch": 1.543993393049997, + "grad_norm": 3.1489923000335693, + "learning_rate": 8.276959585176059e-06, + "loss": 0.8785, + "step": 3038 + }, + { + "epoch": 1.5445016199733181, + "grad_norm": 3.221567153930664, + "learning_rate": 8.275689566758452e-06, + "loss": 0.9196, + "step": 3039 + }, + { + "epoch": 1.5450098468966393, + "grad_norm": 2.85846209526062, + "learning_rate": 8.274419177975978e-06, + "loss": 0.7357, + "step": 3040 + }, + { + "epoch": 1.5455180738199608, + "grad_norm": 3.177860975265503, + "learning_rate": 8.273148418972276e-06, + "loss": 0.8897, + "step": 3041 + }, + { + "epoch": 1.5460263007432817, + "grad_norm": 2.943847894668579, + "learning_rate": 8.271877289891022e-06, + "loss": 0.8209, + "step": 3042 + }, + { + "epoch": 1.5465345276666032, + "grad_norm": 2.898120164871216, + "learning_rate": 8.270605790875936e-06, + "loss": 0.849, + "step": 3043 + }, + { + "epoch": 1.5470427545899244, + "grad_norm": 3.1277554035186768, + "learning_rate": 8.269333922070779e-06, + "loss": 0.8751, + "step": 3044 + }, + { + "epoch": 1.5475509815132455, + "grad_norm": 3.0100021362304688, + "learning_rate": 8.268061683619354e-06, + "loss": 0.7681, + "step": 3045 + }, + { + "epoch": 1.548059208436567, + "grad_norm": 3.272531509399414, + "learning_rate": 8.266789075665513e-06, + "loss": 0.9174, + "step": 3046 + }, + { + "epoch": 1.5485674353598882, + "grad_norm": 3.1157844066619873, + "learning_rate": 8.265516098353134e-06, + "loss": 0.8402, + "step": 3047 + }, + { + "epoch": 1.5490756622832094, + "grad_norm": 3.2872796058654785, + "learning_rate": 8.264242751826149e-06, + "loss": 0.8969, + "step": 3048 + }, + { + "epoch": 1.5495838892065308, + "grad_norm": 2.835674285888672, + "learning_rate": 8.26296903622853e-06, + "loss": 0.8268, + "step": 3049 + }, + { + "epoch": 1.550092116129852, + "grad_norm": 3.2123286724090576, + "learning_rate": 8.26169495170429e-06, + "loss": 0.871, + "step": 3050 + }, + { + "epoch": 1.5506003430531732, + "grad_norm": 3.2385337352752686, + "learning_rate": 8.260420498397477e-06, + "loss": 0.95, + "step": 3051 + }, + { + "epoch": 1.5511085699764946, + "grad_norm": 3.034102439880371, + "learning_rate": 8.259145676452196e-06, + "loss": 0.8378, + "step": 3052 + }, + { + "epoch": 1.5516167968998158, + "grad_norm": 3.435119867324829, + "learning_rate": 8.257870486012574e-06, + "loss": 0.9189, + "step": 3053 + }, + { + "epoch": 1.552125023823137, + "grad_norm": 2.852510929107666, + "learning_rate": 8.256594927222798e-06, + "loss": 0.7759, + "step": 3054 + }, + { + "epoch": 1.5526332507464584, + "grad_norm": 3.141561269760132, + "learning_rate": 8.255319000227087e-06, + "loss": 0.8407, + "step": 3055 + }, + { + "epoch": 1.5531414776697794, + "grad_norm": 3.120166778564453, + "learning_rate": 8.254042705169702e-06, + "loss": 0.8263, + "step": 3056 + }, + { + "epoch": 1.5536497045931008, + "grad_norm": 3.157909393310547, + "learning_rate": 8.252766042194947e-06, + "loss": 0.8824, + "step": 3057 + }, + { + "epoch": 1.5541579315164222, + "grad_norm": 3.0600900650024414, + "learning_rate": 8.251489011447166e-06, + "loss": 0.7545, + "step": 3058 + }, + { + "epoch": 1.5546661584397432, + "grad_norm": 3.2997310161590576, + "learning_rate": 8.25021161307075e-06, + "loss": 0.9094, + "step": 3059 + }, + { + "epoch": 1.5551743853630646, + "grad_norm": 3.1490283012390137, + "learning_rate": 8.248933847210125e-06, + "loss": 0.7762, + "step": 3060 + }, + { + "epoch": 1.5556826122863858, + "grad_norm": 3.1866819858551025, + "learning_rate": 8.247655714009761e-06, + "loss": 0.77, + "step": 3061 + }, + { + "epoch": 1.556190839209707, + "grad_norm": 3.3561694622039795, + "learning_rate": 8.246377213614172e-06, + "loss": 0.8339, + "step": 3062 + }, + { + "epoch": 1.5566990661330284, + "grad_norm": 3.224182605743408, + "learning_rate": 8.245098346167908e-06, + "loss": 0.9327, + "step": 3063 + }, + { + "epoch": 1.5572072930563496, + "grad_norm": 3.1291093826293945, + "learning_rate": 8.243819111815567e-06, + "loss": 0.8927, + "step": 3064 + }, + { + "epoch": 1.5577155199796708, + "grad_norm": 5.050314426422119, + "learning_rate": 8.242539510701784e-06, + "loss": 0.8154, + "step": 3065 + }, + { + "epoch": 1.5582237469029923, + "grad_norm": 3.3334028720855713, + "learning_rate": 8.241259542971234e-06, + "loss": 0.8359, + "step": 3066 + }, + { + "epoch": 1.5587319738263135, + "grad_norm": 3.098841428756714, + "learning_rate": 8.23997920876864e-06, + "loss": 0.8848, + "step": 3067 + }, + { + "epoch": 1.5592402007496347, + "grad_norm": 3.003560781478882, + "learning_rate": 8.238698508238763e-06, + "loss": 0.8935, + "step": 3068 + }, + { + "epoch": 1.559748427672956, + "grad_norm": 4.89196252822876, + "learning_rate": 8.237417441526401e-06, + "loss": 0.8448, + "step": 3069 + }, + { + "epoch": 1.5602566545962773, + "grad_norm": 3.1076719760894775, + "learning_rate": 8.2361360087764e-06, + "loss": 0.7736, + "step": 3070 + }, + { + "epoch": 1.5607648815195985, + "grad_norm": 3.310075521469116, + "learning_rate": 8.234854210133647e-06, + "loss": 0.8718, + "step": 3071 + }, + { + "epoch": 1.5612731084429199, + "grad_norm": 3.2055442333221436, + "learning_rate": 8.233572045743064e-06, + "loss": 0.8538, + "step": 3072 + }, + { + "epoch": 1.5617813353662409, + "grad_norm": 3.108445644378662, + "learning_rate": 8.23228951574962e-06, + "loss": 0.863, + "step": 3073 + }, + { + "epoch": 1.5622895622895623, + "grad_norm": 3.3221216201782227, + "learning_rate": 8.231006620298324e-06, + "loss": 0.8715, + "step": 3074 + }, + { + "epoch": 1.5627977892128837, + "grad_norm": 3.3187458515167236, + "learning_rate": 8.229723359534227e-06, + "loss": 0.8981, + "step": 3075 + }, + { + "epoch": 1.5633060161362047, + "grad_norm": 3.0759851932525635, + "learning_rate": 8.228439733602417e-06, + "loss": 0.7856, + "step": 3076 + }, + { + "epoch": 1.563814243059526, + "grad_norm": 3.011303186416626, + "learning_rate": 8.227155742648034e-06, + "loss": 0.8163, + "step": 3077 + }, + { + "epoch": 1.5643224699828473, + "grad_norm": 3.2420897483825684, + "learning_rate": 8.225871386816246e-06, + "loss": 0.8399, + "step": 3078 + }, + { + "epoch": 1.5648306969061685, + "grad_norm": 3.1554501056671143, + "learning_rate": 8.22458666625227e-06, + "loss": 0.8572, + "step": 3079 + }, + { + "epoch": 1.56533892382949, + "grad_norm": 3.1208579540252686, + "learning_rate": 8.223301581101362e-06, + "loss": 0.894, + "step": 3080 + }, + { + "epoch": 1.5658471507528111, + "grad_norm": 3.216609001159668, + "learning_rate": 8.222016131508822e-06, + "loss": 0.7723, + "step": 3081 + }, + { + "epoch": 1.5663553776761323, + "grad_norm": 3.1499931812286377, + "learning_rate": 8.220730317619984e-06, + "loss": 0.7767, + "step": 3082 + }, + { + "epoch": 1.5668636045994537, + "grad_norm": 3.308377742767334, + "learning_rate": 8.219444139580233e-06, + "loss": 0.8795, + "step": 3083 + }, + { + "epoch": 1.567371831522775, + "grad_norm": 3.081089735031128, + "learning_rate": 8.218157597534989e-06, + "loss": 0.7532, + "step": 3084 + }, + { + "epoch": 1.5678800584460961, + "grad_norm": 3.2779386043548584, + "learning_rate": 8.216870691629715e-06, + "loss": 0.8305, + "step": 3085 + }, + { + "epoch": 1.5683882853694175, + "grad_norm": 3.1625919342041016, + "learning_rate": 8.215583422009912e-06, + "loss": 0.8548, + "step": 3086 + }, + { + "epoch": 1.5688965122927387, + "grad_norm": 3.231231451034546, + "learning_rate": 8.214295788821128e-06, + "loss": 0.8647, + "step": 3087 + }, + { + "epoch": 1.56940473921606, + "grad_norm": 3.0235724449157715, + "learning_rate": 8.213007792208946e-06, + "loss": 0.8357, + "step": 3088 + }, + { + "epoch": 1.5699129661393814, + "grad_norm": 3.2855448722839355, + "learning_rate": 8.211719432318996e-06, + "loss": 0.8629, + "step": 3089 + }, + { + "epoch": 1.5704211930627023, + "grad_norm": 3.349738121032715, + "learning_rate": 8.210430709296946e-06, + "loss": 0.8685, + "step": 3090 + }, + { + "epoch": 1.5709294199860238, + "grad_norm": 3.026463031768799, + "learning_rate": 8.209141623288501e-06, + "loss": 0.8174, + "step": 3091 + }, + { + "epoch": 1.5714376469093452, + "grad_norm": 3.2298712730407715, + "learning_rate": 8.207852174439415e-06, + "loss": 0.8269, + "step": 3092 + }, + { + "epoch": 1.5719458738326662, + "grad_norm": 3.0465500354766846, + "learning_rate": 8.206562362895476e-06, + "loss": 0.8116, + "step": 3093 + }, + { + "epoch": 1.5724541007559876, + "grad_norm": 3.303372859954834, + "learning_rate": 8.20527218880252e-06, + "loss": 0.8121, + "step": 3094 + }, + { + "epoch": 1.5729623276793088, + "grad_norm": 3.1203267574310303, + "learning_rate": 8.203981652306418e-06, + "loss": 0.7643, + "step": 3095 + }, + { + "epoch": 1.57347055460263, + "grad_norm": 3.2606565952301025, + "learning_rate": 8.202690753553083e-06, + "loss": 0.8244, + "step": 3096 + }, + { + "epoch": 1.5739787815259514, + "grad_norm": 3.0706636905670166, + "learning_rate": 8.201399492688474e-06, + "loss": 0.8284, + "step": 3097 + }, + { + "epoch": 1.5744870084492726, + "grad_norm": 3.146022081375122, + "learning_rate": 8.20010786985858e-06, + "loss": 0.9854, + "step": 3098 + }, + { + "epoch": 1.5749952353725938, + "grad_norm": 3.0561680793762207, + "learning_rate": 8.198815885209445e-06, + "loss": 0.8211, + "step": 3099 + }, + { + "epoch": 1.5755034622959152, + "grad_norm": 3.139600992202759, + "learning_rate": 8.197523538887144e-06, + "loss": 0.7939, + "step": 3100 + }, + { + "epoch": 1.5760116892192364, + "grad_norm": 3.0058977603912354, + "learning_rate": 8.196230831037797e-06, + "loss": 0.7286, + "step": 3101 + }, + { + "epoch": 1.5765199161425576, + "grad_norm": 3.07700777053833, + "learning_rate": 8.194937761807561e-06, + "loss": 0.7964, + "step": 3102 + }, + { + "epoch": 1.577028143065879, + "grad_norm": 2.9995245933532715, + "learning_rate": 8.193644331342639e-06, + "loss": 0.8075, + "step": 3103 + }, + { + "epoch": 1.5775363699892002, + "grad_norm": 3.1165170669555664, + "learning_rate": 8.19235053978927e-06, + "loss": 0.8286, + "step": 3104 + }, + { + "epoch": 1.5780445969125214, + "grad_norm": 3.026459217071533, + "learning_rate": 8.19105638729374e-06, + "loss": 0.7808, + "step": 3105 + }, + { + "epoch": 1.5785528238358428, + "grad_norm": 3.1128146648406982, + "learning_rate": 8.189761874002369e-06, + "loss": 0.7671, + "step": 3106 + }, + { + "epoch": 1.5790610507591638, + "grad_norm": 3.3012728691101074, + "learning_rate": 8.18846700006152e-06, + "loss": 0.8824, + "step": 3107 + }, + { + "epoch": 1.5795692776824852, + "grad_norm": 3.106581211090088, + "learning_rate": 8.187171765617598e-06, + "loss": 0.8511, + "step": 3108 + }, + { + "epoch": 1.5800775046058066, + "grad_norm": 3.08072566986084, + "learning_rate": 8.18587617081705e-06, + "loss": 0.8367, + "step": 3109 + }, + { + "epoch": 1.5805857315291276, + "grad_norm": 3.067379951477051, + "learning_rate": 8.184580215806363e-06, + "loss": 0.7869, + "step": 3110 + }, + { + "epoch": 1.581093958452449, + "grad_norm": 3.0315959453582764, + "learning_rate": 8.18328390073206e-06, + "loss": 0.8744, + "step": 3111 + }, + { + "epoch": 1.5816021853757702, + "grad_norm": 2.9520187377929688, + "learning_rate": 8.181987225740711e-06, + "loss": 0.7672, + "step": 3112 + }, + { + "epoch": 1.5821104122990914, + "grad_norm": 2.9568943977355957, + "learning_rate": 8.180690190978923e-06, + "loss": 0.8574, + "step": 3113 + }, + { + "epoch": 1.5826186392224129, + "grad_norm": 3.3284239768981934, + "learning_rate": 8.179392796593346e-06, + "loss": 0.8003, + "step": 3114 + }, + { + "epoch": 1.583126866145734, + "grad_norm": 3.1131980419158936, + "learning_rate": 8.17809504273067e-06, + "loss": 0.8305, + "step": 3115 + }, + { + "epoch": 1.5836350930690553, + "grad_norm": 3.2879278659820557, + "learning_rate": 8.176796929537622e-06, + "loss": 0.8894, + "step": 3116 + }, + { + "epoch": 1.5841433199923767, + "grad_norm": 3.3802006244659424, + "learning_rate": 8.175498457160976e-06, + "loss": 0.846, + "step": 3117 + }, + { + "epoch": 1.5846515469156979, + "grad_norm": 3.263233184814453, + "learning_rate": 8.174199625747542e-06, + "loss": 0.8689, + "step": 3118 + }, + { + "epoch": 1.585159773839019, + "grad_norm": 3.2811408042907715, + "learning_rate": 8.172900435444174e-06, + "loss": 0.8363, + "step": 3119 + }, + { + "epoch": 1.5856680007623405, + "grad_norm": 3.4866831302642822, + "learning_rate": 8.17160088639776e-06, + "loss": 0.8864, + "step": 3120 + }, + { + "epoch": 1.5861762276856617, + "grad_norm": 3.2428488731384277, + "learning_rate": 8.170300978755236e-06, + "loss": 0.8778, + "step": 3121 + }, + { + "epoch": 1.5866844546089829, + "grad_norm": 3.249417543411255, + "learning_rate": 8.169000712663577e-06, + "loss": 0.8464, + "step": 3122 + }, + { + "epoch": 1.5871926815323043, + "grad_norm": 3.479041576385498, + "learning_rate": 8.167700088269796e-06, + "loss": 0.8951, + "step": 3123 + }, + { + "epoch": 1.5877009084556253, + "grad_norm": 3.032106637954712, + "learning_rate": 8.166399105720946e-06, + "loss": 0.8026, + "step": 3124 + }, + { + "epoch": 1.5882091353789467, + "grad_norm": 2.91414737701416, + "learning_rate": 8.165097765164126e-06, + "loss": 0.8015, + "step": 3125 + }, + { + "epoch": 1.5887173623022681, + "grad_norm": 2.9475793838500977, + "learning_rate": 8.163796066746468e-06, + "loss": 0.7377, + "step": 3126 + }, + { + "epoch": 1.589225589225589, + "grad_norm": 3.372371196746826, + "learning_rate": 8.16249401061515e-06, + "loss": 0.86, + "step": 3127 + }, + { + "epoch": 1.5897338161489105, + "grad_norm": 3.2583720684051514, + "learning_rate": 8.161191596917385e-06, + "loss": 0.9854, + "step": 3128 + }, + { + "epoch": 1.5902420430722317, + "grad_norm": 3.0136237144470215, + "learning_rate": 8.159888825800439e-06, + "loss": 0.8749, + "step": 3129 + }, + { + "epoch": 1.590750269995553, + "grad_norm": 2.987494707107544, + "learning_rate": 8.158585697411601e-06, + "loss": 0.8088, + "step": 3130 + }, + { + "epoch": 1.5912584969188743, + "grad_norm": 3.117647409439087, + "learning_rate": 8.15728221189821e-06, + "loss": 0.8514, + "step": 3131 + }, + { + "epoch": 1.5917667238421955, + "grad_norm": 3.0407848358154297, + "learning_rate": 8.155978369407647e-06, + "loss": 0.9176, + "step": 3132 + }, + { + "epoch": 1.5922749507655167, + "grad_norm": 3.0892696380615234, + "learning_rate": 8.154674170087328e-06, + "loss": 0.8179, + "step": 3133 + }, + { + "epoch": 1.5927831776888381, + "grad_norm": 2.9991137981414795, + "learning_rate": 8.153369614084713e-06, + "loss": 0.8015, + "step": 3134 + }, + { + "epoch": 1.5932914046121593, + "grad_norm": 3.2096457481384277, + "learning_rate": 8.152064701547304e-06, + "loss": 0.932, + "step": 3135 + }, + { + "epoch": 1.5937996315354805, + "grad_norm": 3.3632469177246094, + "learning_rate": 8.150759432622635e-06, + "loss": 0.8488, + "step": 3136 + }, + { + "epoch": 1.594307858458802, + "grad_norm": 3.230520009994507, + "learning_rate": 8.14945380745829e-06, + "loss": 0.8742, + "step": 3137 + }, + { + "epoch": 1.5948160853821232, + "grad_norm": 3.2006702423095703, + "learning_rate": 8.148147826201887e-06, + "loss": 0.8101, + "step": 3138 + }, + { + "epoch": 1.5953243123054444, + "grad_norm": 3.0946967601776123, + "learning_rate": 8.146841489001089e-06, + "loss": 0.885, + "step": 3139 + }, + { + "epoch": 1.5958325392287658, + "grad_norm": 3.1396210193634033, + "learning_rate": 8.145534796003593e-06, + "loss": 0.8769, + "step": 3140 + }, + { + "epoch": 1.5963407661520868, + "grad_norm": 3.229386329650879, + "learning_rate": 8.144227747357142e-06, + "loss": 0.846, + "step": 3141 + }, + { + "epoch": 1.5968489930754082, + "grad_norm": 3.0499179363250732, + "learning_rate": 8.142920343209516e-06, + "loss": 0.8342, + "step": 3142 + }, + { + "epoch": 1.5973572199987296, + "grad_norm": 2.994961738586426, + "learning_rate": 8.141612583708539e-06, + "loss": 0.8829, + "step": 3143 + }, + { + "epoch": 1.5978654469220506, + "grad_norm": 2.935119390487671, + "learning_rate": 8.14030446900207e-06, + "loss": 0.8191, + "step": 3144 + }, + { + "epoch": 1.598373673845372, + "grad_norm": 3.3414881229400635, + "learning_rate": 8.138995999238011e-06, + "loss": 0.8305, + "step": 3145 + }, + { + "epoch": 1.5988819007686932, + "grad_norm": 3.234374761581421, + "learning_rate": 8.137687174564303e-06, + "loss": 0.9135, + "step": 3146 + }, + { + "epoch": 1.5993901276920144, + "grad_norm": 3.135486602783203, + "learning_rate": 8.136377995128929e-06, + "loss": 0.8391, + "step": 3147 + }, + { + "epoch": 1.5998983546153358, + "grad_norm": 2.8271825313568115, + "learning_rate": 8.135068461079912e-06, + "loss": 0.8114, + "step": 3148 + }, + { + "epoch": 1.600406581538657, + "grad_norm": 3.3534281253814697, + "learning_rate": 8.13375857256531e-06, + "loss": 0.8856, + "step": 3149 + }, + { + "epoch": 1.6009148084619782, + "grad_norm": 2.902682065963745, + "learning_rate": 8.13244832973323e-06, + "loss": 0.8384, + "step": 3150 + }, + { + "epoch": 1.6014230353852996, + "grad_norm": 3.036695718765259, + "learning_rate": 8.131137732731811e-06, + "loss": 0.9197, + "step": 3151 + }, + { + "epoch": 1.6019312623086208, + "grad_norm": 2.823070526123047, + "learning_rate": 8.129826781709239e-06, + "loss": 0.8652, + "step": 3152 + }, + { + "epoch": 1.602439489231942, + "grad_norm": 3.1444478034973145, + "learning_rate": 8.12851547681373e-06, + "loss": 0.783, + "step": 3153 + }, + { + "epoch": 1.6029477161552634, + "grad_norm": 2.9253718852996826, + "learning_rate": 8.127203818193551e-06, + "loss": 0.8148, + "step": 3154 + }, + { + "epoch": 1.6034559430785846, + "grad_norm": 3.1179044246673584, + "learning_rate": 8.125891805997005e-06, + "loss": 0.8942, + "step": 3155 + }, + { + "epoch": 1.6039641700019058, + "grad_norm": 3.1189663410186768, + "learning_rate": 8.12457944037243e-06, + "loss": 0.8296, + "step": 3156 + }, + { + "epoch": 1.6044723969252273, + "grad_norm": 3.124115228652954, + "learning_rate": 8.123266721468212e-06, + "loss": 0.8175, + "step": 3157 + }, + { + "epoch": 1.6049806238485482, + "grad_norm": 3.2029671669006348, + "learning_rate": 8.121953649432772e-06, + "loss": 0.8313, + "step": 3158 + }, + { + "epoch": 1.6054888507718696, + "grad_norm": 3.090684175491333, + "learning_rate": 8.120640224414572e-06, + "loss": 0.7608, + "step": 3159 + }, + { + "epoch": 1.605997077695191, + "grad_norm": 3.2074391841888428, + "learning_rate": 8.119326446562112e-06, + "loss": 0.864, + "step": 3160 + }, + { + "epoch": 1.606505304618512, + "grad_norm": 3.1305856704711914, + "learning_rate": 8.118012316023939e-06, + "loss": 0.8679, + "step": 3161 + }, + { + "epoch": 1.6070135315418335, + "grad_norm": 3.354135274887085, + "learning_rate": 8.11669783294863e-06, + "loss": 0.9898, + "step": 3162 + }, + { + "epoch": 1.6075217584651547, + "grad_norm": 3.194979190826416, + "learning_rate": 8.115382997484809e-06, + "loss": 0.7727, + "step": 3163 + }, + { + "epoch": 1.6080299853884759, + "grad_norm": 3.311617374420166, + "learning_rate": 8.114067809781137e-06, + "loss": 0.9731, + "step": 3164 + }, + { + "epoch": 1.6085382123117973, + "grad_norm": 3.249483585357666, + "learning_rate": 8.112752269986314e-06, + "loss": 0.8348, + "step": 3165 + }, + { + "epoch": 1.6090464392351185, + "grad_norm": 2.845046043395996, + "learning_rate": 8.111436378249085e-06, + "loss": 0.7932, + "step": 3166 + }, + { + "epoch": 1.6095546661584397, + "grad_norm": 3.1641786098480225, + "learning_rate": 8.110120134718224e-06, + "loss": 0.8059, + "step": 3167 + }, + { + "epoch": 1.610062893081761, + "grad_norm": 3.048527479171753, + "learning_rate": 8.10880353954256e-06, + "loss": 0.7602, + "step": 3168 + }, + { + "epoch": 1.6105711200050823, + "grad_norm": 3.179840564727783, + "learning_rate": 8.107486592870945e-06, + "loss": 0.9068, + "step": 3169 + }, + { + "epoch": 1.6110793469284035, + "grad_norm": 3.40436053276062, + "learning_rate": 8.106169294852288e-06, + "loss": 0.8295, + "step": 3170 + }, + { + "epoch": 1.611587573851725, + "grad_norm": 3.0481929779052734, + "learning_rate": 8.104851645635521e-06, + "loss": 0.7796, + "step": 3171 + }, + { + "epoch": 1.612095800775046, + "grad_norm": 2.995546817779541, + "learning_rate": 8.103533645369629e-06, + "loss": 0.873, + "step": 3172 + }, + { + "epoch": 1.6126040276983673, + "grad_norm": 3.2442634105682373, + "learning_rate": 8.102215294203627e-06, + "loss": 1.0155, + "step": 3173 + }, + { + "epoch": 1.6131122546216887, + "grad_norm": 3.1833958625793457, + "learning_rate": 8.100896592286579e-06, + "loss": 0.8552, + "step": 3174 + }, + { + "epoch": 1.6136204815450097, + "grad_norm": 3.268798351287842, + "learning_rate": 8.099577539767578e-06, + "loss": 0.8518, + "step": 3175 + }, + { + "epoch": 1.6141287084683311, + "grad_norm": 3.209165334701538, + "learning_rate": 8.098258136795767e-06, + "loss": 0.8605, + "step": 3176 + }, + { + "epoch": 1.6146369353916525, + "grad_norm": 3.4300894737243652, + "learning_rate": 8.096938383520323e-06, + "loss": 0.8265, + "step": 3177 + }, + { + "epoch": 1.6151451623149735, + "grad_norm": 3.218397378921509, + "learning_rate": 8.09561828009046e-06, + "loss": 0.8257, + "step": 3178 + }, + { + "epoch": 1.615653389238295, + "grad_norm": 3.162224292755127, + "learning_rate": 8.09429782665544e-06, + "loss": 0.8665, + "step": 3179 + }, + { + "epoch": 1.6161616161616161, + "grad_norm": 3.493285894393921, + "learning_rate": 8.092977023364556e-06, + "loss": 0.7889, + "step": 3180 + }, + { + "epoch": 1.6166698430849373, + "grad_norm": 3.060194492340088, + "learning_rate": 8.091655870367146e-06, + "loss": 0.8791, + "step": 3181 + }, + { + "epoch": 1.6171780700082588, + "grad_norm": 2.9806981086730957, + "learning_rate": 8.090334367812584e-06, + "loss": 0.7623, + "step": 3182 + }, + { + "epoch": 1.61768629693158, + "grad_norm": 3.182471513748169, + "learning_rate": 8.08901251585029e-06, + "loss": 0.9179, + "step": 3183 + }, + { + "epoch": 1.6181945238549011, + "grad_norm": 2.998816728591919, + "learning_rate": 8.087690314629712e-06, + "loss": 0.8197, + "step": 3184 + }, + { + "epoch": 1.6187027507782226, + "grad_norm": 2.9710581302642822, + "learning_rate": 8.086367764300352e-06, + "loss": 0.8487, + "step": 3185 + }, + { + "epoch": 1.6192109777015438, + "grad_norm": 3.1282782554626465, + "learning_rate": 8.085044865011735e-06, + "loss": 0.7931, + "step": 3186 + }, + { + "epoch": 1.619719204624865, + "grad_norm": 3.08868408203125, + "learning_rate": 8.083721616913441e-06, + "loss": 0.8249, + "step": 3187 + }, + { + "epoch": 1.6202274315481864, + "grad_norm": 3.246670722961426, + "learning_rate": 8.08239802015508e-06, + "loss": 0.7897, + "step": 3188 + }, + { + "epoch": 1.6207356584715076, + "grad_norm": 3.136277437210083, + "learning_rate": 8.081074074886303e-06, + "loss": 0.8653, + "step": 3189 + }, + { + "epoch": 1.6212438853948288, + "grad_norm": 3.2505767345428467, + "learning_rate": 8.079749781256806e-06, + "loss": 0.8833, + "step": 3190 + }, + { + "epoch": 1.6217521123181502, + "grad_norm": 3.1759870052337646, + "learning_rate": 8.078425139416314e-06, + "loss": 0.8268, + "step": 3191 + }, + { + "epoch": 1.6222603392414712, + "grad_norm": 3.146894931793213, + "learning_rate": 8.077100149514601e-06, + "loss": 0.7529, + "step": 3192 + }, + { + "epoch": 1.6227685661647926, + "grad_norm": 3.2407381534576416, + "learning_rate": 8.075774811701477e-06, + "loss": 0.8144, + "step": 3193 + }, + { + "epoch": 1.6232767930881138, + "grad_norm": 3.037705421447754, + "learning_rate": 8.074449126126788e-06, + "loss": 0.8034, + "step": 3194 + }, + { + "epoch": 1.623785020011435, + "grad_norm": 3.1687588691711426, + "learning_rate": 8.073123092940424e-06, + "loss": 0.8729, + "step": 3195 + }, + { + "epoch": 1.6242932469347564, + "grad_norm": 3.1716537475585938, + "learning_rate": 8.071796712292313e-06, + "loss": 0.8498, + "step": 3196 + }, + { + "epoch": 1.6248014738580776, + "grad_norm": 3.520030975341797, + "learning_rate": 8.070469984332421e-06, + "loss": 0.9367, + "step": 3197 + }, + { + "epoch": 1.6253097007813988, + "grad_norm": 3.2190101146698, + "learning_rate": 8.069142909210755e-06, + "loss": 0.7717, + "step": 3198 + }, + { + "epoch": 1.6258179277047202, + "grad_norm": 3.204716205596924, + "learning_rate": 8.067815487077357e-06, + "loss": 0.9277, + "step": 3199 + }, + { + "epoch": 1.6263261546280414, + "grad_norm": 2.906593084335327, + "learning_rate": 8.066487718082316e-06, + "loss": 0.8637, + "step": 3200 + }, + { + "epoch": 1.6268343815513626, + "grad_norm": 3.077334403991699, + "learning_rate": 8.065159602375754e-06, + "loss": 0.8172, + "step": 3201 + }, + { + "epoch": 1.627342608474684, + "grad_norm": 3.0299875736236572, + "learning_rate": 8.063831140107834e-06, + "loss": 0.8891, + "step": 3202 + }, + { + "epoch": 1.6278508353980052, + "grad_norm": 3.038489580154419, + "learning_rate": 8.06250233142876e-06, + "loss": 0.8571, + "step": 3203 + }, + { + "epoch": 1.6283590623213264, + "grad_norm": 3.1936428546905518, + "learning_rate": 8.061173176488769e-06, + "loss": 0.8191, + "step": 3204 + }, + { + "epoch": 1.6288672892446479, + "grad_norm": 2.9855539798736572, + "learning_rate": 8.059843675438144e-06, + "loss": 0.8109, + "step": 3205 + }, + { + "epoch": 1.629375516167969, + "grad_norm": 3.1044156551361084, + "learning_rate": 8.058513828427206e-06, + "loss": 0.8533, + "step": 3206 + }, + { + "epoch": 1.6298837430912902, + "grad_norm": 3.6107935905456543, + "learning_rate": 8.057183635606312e-06, + "loss": 0.9247, + "step": 3207 + }, + { + "epoch": 1.6303919700146117, + "grad_norm": 3.2537338733673096, + "learning_rate": 8.055853097125858e-06, + "loss": 0.8406, + "step": 3208 + }, + { + "epoch": 1.6309001969379326, + "grad_norm": 3.1173675060272217, + "learning_rate": 8.054522213136287e-06, + "loss": 0.7766, + "step": 3209 + }, + { + "epoch": 1.631408423861254, + "grad_norm": 3.2477848529815674, + "learning_rate": 8.05319098378807e-06, + "loss": 0.8719, + "step": 3210 + }, + { + "epoch": 1.6319166507845753, + "grad_norm": 3.6281533241271973, + "learning_rate": 8.051859409231723e-06, + "loss": 0.8705, + "step": 3211 + }, + { + "epoch": 1.6324248777078965, + "grad_norm": 3.104458808898926, + "learning_rate": 8.0505274896178e-06, + "loss": 0.8385, + "step": 3212 + }, + { + "epoch": 1.6329331046312179, + "grad_norm": 3.092541456222534, + "learning_rate": 8.049195225096897e-06, + "loss": 0.9495, + "step": 3213 + }, + { + "epoch": 1.633441331554539, + "grad_norm": 3.2451331615448, + "learning_rate": 8.047862615819642e-06, + "loss": 0.8221, + "step": 3214 + }, + { + "epoch": 1.6339495584778603, + "grad_norm": 3.119635820388794, + "learning_rate": 8.046529661936707e-06, + "loss": 0.8372, + "step": 3215 + }, + { + "epoch": 1.6344577854011817, + "grad_norm": 3.5131008625030518, + "learning_rate": 8.045196363598802e-06, + "loss": 0.897, + "step": 3216 + }, + { + "epoch": 1.634966012324503, + "grad_norm": 3.041543960571289, + "learning_rate": 8.04386272095668e-06, + "loss": 0.8203, + "step": 3217 + }, + { + "epoch": 1.635474239247824, + "grad_norm": 4.3333587646484375, + "learning_rate": 8.042528734161123e-06, + "loss": 0.8801, + "step": 3218 + }, + { + "epoch": 1.6359824661711455, + "grad_norm": 3.03456974029541, + "learning_rate": 8.04119440336296e-06, + "loss": 0.8274, + "step": 3219 + }, + { + "epoch": 1.6364906930944667, + "grad_norm": 3.17861008644104, + "learning_rate": 8.039859728713058e-06, + "loss": 0.8268, + "step": 3220 + }, + { + "epoch": 1.636998920017788, + "grad_norm": 3.216559648513794, + "learning_rate": 8.038524710362321e-06, + "loss": 0.8748, + "step": 3221 + }, + { + "epoch": 1.6375071469411093, + "grad_norm": 2.9259185791015625, + "learning_rate": 8.037189348461692e-06, + "loss": 0.8382, + "step": 3222 + }, + { + "epoch": 1.6380153738644303, + "grad_norm": 2.898538589477539, + "learning_rate": 8.035853643162153e-06, + "loss": 0.7463, + "step": 3223 + }, + { + "epoch": 1.6385236007877517, + "grad_norm": 3.110093593597412, + "learning_rate": 8.034517594614726e-06, + "loss": 0.8093, + "step": 3224 + }, + { + "epoch": 1.6390318277110731, + "grad_norm": 3.151292085647583, + "learning_rate": 8.033181202970471e-06, + "loss": 0.8397, + "step": 3225 + }, + { + "epoch": 1.6395400546343941, + "grad_norm": 3.235694408416748, + "learning_rate": 8.031844468380485e-06, + "loss": 0.9665, + "step": 3226 + }, + { + "epoch": 1.6400482815577155, + "grad_norm": 3.0993845462799072, + "learning_rate": 8.030507390995907e-06, + "loss": 0.8412, + "step": 3227 + }, + { + "epoch": 1.6405565084810367, + "grad_norm": 3.3848185539245605, + "learning_rate": 8.029169970967914e-06, + "loss": 0.9206, + "step": 3228 + }, + { + "epoch": 1.641064735404358, + "grad_norm": 3.3129689693450928, + "learning_rate": 8.027832208447719e-06, + "loss": 0.8809, + "step": 3229 + }, + { + "epoch": 1.6415729623276794, + "grad_norm": 3.0754380226135254, + "learning_rate": 8.026494103586577e-06, + "loss": 0.804, + "step": 3230 + }, + { + "epoch": 1.6420811892510006, + "grad_norm": 3.0243043899536133, + "learning_rate": 8.025155656535782e-06, + "loss": 0.7182, + "step": 3231 + }, + { + "epoch": 1.6425894161743217, + "grad_norm": 3.0670719146728516, + "learning_rate": 8.02381686744666e-06, + "loss": 0.8181, + "step": 3232 + }, + { + "epoch": 1.6430976430976432, + "grad_norm": 3.205423355102539, + "learning_rate": 8.022477736470584e-06, + "loss": 0.8251, + "step": 3233 + }, + { + "epoch": 1.6436058700209644, + "grad_norm": 3.2314603328704834, + "learning_rate": 8.021138263758966e-06, + "loss": 0.8689, + "step": 3234 + }, + { + "epoch": 1.6441140969442856, + "grad_norm": 3.0328774452209473, + "learning_rate": 8.019798449463248e-06, + "loss": 0.7866, + "step": 3235 + }, + { + "epoch": 1.644622323867607, + "grad_norm": 3.1050779819488525, + "learning_rate": 8.018458293734917e-06, + "loss": 0.8379, + "step": 3236 + }, + { + "epoch": 1.6451305507909282, + "grad_norm": 3.1296982765197754, + "learning_rate": 8.017117796725495e-06, + "loss": 0.7903, + "step": 3237 + }, + { + "epoch": 1.6456387777142494, + "grad_norm": 3.1918692588806152, + "learning_rate": 8.015776958586553e-06, + "loss": 0.8031, + "step": 3238 + }, + { + "epoch": 1.6461470046375708, + "grad_norm": 3.2104053497314453, + "learning_rate": 8.014435779469682e-06, + "loss": 0.866, + "step": 3239 + }, + { + "epoch": 1.6466552315608918, + "grad_norm": 3.264033079147339, + "learning_rate": 8.013094259526528e-06, + "loss": 0.824, + "step": 3240 + }, + { + "epoch": 1.6471634584842132, + "grad_norm": 3.0460946559906006, + "learning_rate": 8.011752398908771e-06, + "loss": 0.824, + "step": 3241 + }, + { + "epoch": 1.6476716854075346, + "grad_norm": 3.3134658336639404, + "learning_rate": 8.010410197768123e-06, + "loss": 0.8077, + "step": 3242 + }, + { + "epoch": 1.6481799123308556, + "grad_norm": 3.2771031856536865, + "learning_rate": 8.009067656256344e-06, + "loss": 0.8466, + "step": 3243 + }, + { + "epoch": 1.648688139254177, + "grad_norm": 3.121896982192993, + "learning_rate": 8.007724774525225e-06, + "loss": 0.7764, + "step": 3244 + }, + { + "epoch": 1.6491963661774982, + "grad_norm": 3.2331111431121826, + "learning_rate": 8.006381552726601e-06, + "loss": 0.7678, + "step": 3245 + }, + { + "epoch": 1.6497045931008194, + "grad_norm": 3.142518997192383, + "learning_rate": 8.005037991012341e-06, + "loss": 0.8648, + "step": 3246 + }, + { + "epoch": 1.6502128200241408, + "grad_norm": 3.501854181289673, + "learning_rate": 8.003694089534355e-06, + "loss": 0.7738, + "step": 3247 + }, + { + "epoch": 1.650721046947462, + "grad_norm": 3.3636884689331055, + "learning_rate": 8.00234984844459e-06, + "loss": 0.8262, + "step": 3248 + }, + { + "epoch": 1.6512292738707832, + "grad_norm": 3.1698949337005615, + "learning_rate": 8.001005267895034e-06, + "loss": 0.8882, + "step": 3249 + }, + { + "epoch": 1.6517375007941046, + "grad_norm": 3.1779544353485107, + "learning_rate": 7.999660348037713e-06, + "loss": 0.9491, + "step": 3250 + }, + { + "epoch": 1.6522457277174258, + "grad_norm": 3.0099754333496094, + "learning_rate": 7.998315089024684e-06, + "loss": 0.7621, + "step": 3251 + }, + { + "epoch": 1.652753954640747, + "grad_norm": 3.006117582321167, + "learning_rate": 7.996969491008054e-06, + "loss": 0.7613, + "step": 3252 + }, + { + "epoch": 1.6532621815640685, + "grad_norm": 3.1954116821289062, + "learning_rate": 7.99562355413996e-06, + "loss": 0.9564, + "step": 3253 + }, + { + "epoch": 1.6537704084873897, + "grad_norm": 3.165761947631836, + "learning_rate": 7.994277278572581e-06, + "loss": 0.8525, + "step": 3254 + }, + { + "epoch": 1.6542786354107109, + "grad_norm": 2.9796812534332275, + "learning_rate": 7.992930664458131e-06, + "loss": 0.7416, + "step": 3255 + }, + { + "epoch": 1.6547868623340323, + "grad_norm": 3.133790969848633, + "learning_rate": 7.99158371194887e-06, + "loss": 0.8482, + "step": 3256 + }, + { + "epoch": 1.6552950892573532, + "grad_norm": 3.0982847213745117, + "learning_rate": 7.990236421197084e-06, + "loss": 0.8582, + "step": 3257 + }, + { + "epoch": 1.6558033161806747, + "grad_norm": 3.39365816116333, + "learning_rate": 7.98888879235511e-06, + "loss": 0.8901, + "step": 3258 + }, + { + "epoch": 1.656311543103996, + "grad_norm": 3.165888547897339, + "learning_rate": 7.987540825575313e-06, + "loss": 0.9455, + "step": 3259 + }, + { + "epoch": 1.656819770027317, + "grad_norm": 3.2440237998962402, + "learning_rate": 7.986192521010103e-06, + "loss": 0.7762, + "step": 3260 + }, + { + "epoch": 1.6573279969506385, + "grad_norm": 3.042271375656128, + "learning_rate": 7.984843878811924e-06, + "loss": 0.8588, + "step": 3261 + }, + { + "epoch": 1.6578362238739597, + "grad_norm": 3.1160874366760254, + "learning_rate": 7.983494899133259e-06, + "loss": 0.8799, + "step": 3262 + }, + { + "epoch": 1.6583444507972809, + "grad_norm": 3.0635807514190674, + "learning_rate": 7.982145582126633e-06, + "loss": 0.817, + "step": 3263 + }, + { + "epoch": 1.6588526777206023, + "grad_norm": 3.40155029296875, + "learning_rate": 7.980795927944602e-06, + "loss": 0.9681, + "step": 3264 + }, + { + "epoch": 1.6593609046439235, + "grad_norm": 3.1403932571411133, + "learning_rate": 7.979445936739769e-06, + "loss": 0.833, + "step": 3265 + }, + { + "epoch": 1.6598691315672447, + "grad_norm": 3.3115484714508057, + "learning_rate": 7.97809560866477e-06, + "loss": 0.8623, + "step": 3266 + }, + { + "epoch": 1.6603773584905661, + "grad_norm": 3.2069787979125977, + "learning_rate": 7.976744943872274e-06, + "loss": 0.821, + "step": 3267 + }, + { + "epoch": 1.6608855854138873, + "grad_norm": 3.360119581222534, + "learning_rate": 7.975393942514998e-06, + "loss": 0.8245, + "step": 3268 + }, + { + "epoch": 1.6613938123372085, + "grad_norm": 3.2077269554138184, + "learning_rate": 7.974042604745692e-06, + "loss": 0.8357, + "step": 3269 + }, + { + "epoch": 1.66190203926053, + "grad_norm": 2.924471616744995, + "learning_rate": 7.972690930717145e-06, + "loss": 0.7866, + "step": 3270 + }, + { + "epoch": 1.6624102661838511, + "grad_norm": 3.392030715942383, + "learning_rate": 7.971338920582182e-06, + "loss": 0.8965, + "step": 3271 + }, + { + "epoch": 1.6629184931071723, + "grad_norm": 2.932337522506714, + "learning_rate": 7.969986574493667e-06, + "loss": 0.7455, + "step": 3272 + }, + { + "epoch": 1.6634267200304937, + "grad_norm": 3.115884780883789, + "learning_rate": 7.968633892604508e-06, + "loss": 0.8043, + "step": 3273 + }, + { + "epoch": 1.6639349469538147, + "grad_norm": 3.195850372314453, + "learning_rate": 7.967280875067638e-06, + "loss": 0.871, + "step": 3274 + }, + { + "epoch": 1.6644431738771361, + "grad_norm": 3.040839433670044, + "learning_rate": 7.965927522036041e-06, + "loss": 0.867, + "step": 3275 + }, + { + "epoch": 1.6649514008004576, + "grad_norm": 3.0806403160095215, + "learning_rate": 7.964573833662731e-06, + "loss": 0.8094, + "step": 3276 + }, + { + "epoch": 1.6654596277237785, + "grad_norm": 2.9806809425354004, + "learning_rate": 7.963219810100765e-06, + "loss": 0.9022, + "step": 3277 + }, + { + "epoch": 1.6659678546471, + "grad_norm": 3.1467132568359375, + "learning_rate": 7.96186545150323e-06, + "loss": 0.8511, + "step": 3278 + }, + { + "epoch": 1.6664760815704212, + "grad_norm": 2.929919481277466, + "learning_rate": 7.960510758023261e-06, + "loss": 0.8277, + "step": 3279 + }, + { + "epoch": 1.6669843084937424, + "grad_norm": 3.274540662765503, + "learning_rate": 7.959155729814025e-06, + "loss": 0.8846, + "step": 3280 + }, + { + "epoch": 1.6674925354170638, + "grad_norm": 2.9907915592193604, + "learning_rate": 7.957800367028726e-06, + "loss": 0.7783, + "step": 3281 + }, + { + "epoch": 1.668000762340385, + "grad_norm": 3.237807035446167, + "learning_rate": 7.956444669820611e-06, + "loss": 0.7738, + "step": 3282 + }, + { + "epoch": 1.6685089892637062, + "grad_norm": 2.7499542236328125, + "learning_rate": 7.955088638342959e-06, + "loss": 0.7801, + "step": 3283 + }, + { + "epoch": 1.6690172161870276, + "grad_norm": 3.229651927947998, + "learning_rate": 7.953732272749089e-06, + "loss": 0.8682, + "step": 3284 + }, + { + "epoch": 1.6695254431103488, + "grad_norm": 2.972989320755005, + "learning_rate": 7.95237557319236e-06, + "loss": 0.791, + "step": 3285 + }, + { + "epoch": 1.67003367003367, + "grad_norm": 3.0465450286865234, + "learning_rate": 7.951018539826162e-06, + "loss": 0.7577, + "step": 3286 + }, + { + "epoch": 1.6705418969569914, + "grad_norm": 3.4167490005493164, + "learning_rate": 7.949661172803935e-06, + "loss": 0.9066, + "step": 3287 + }, + { + "epoch": 1.6710501238803126, + "grad_norm": 3.232654094696045, + "learning_rate": 7.948303472279144e-06, + "loss": 0.8161, + "step": 3288 + }, + { + "epoch": 1.6715583508036338, + "grad_norm": 3.0992579460144043, + "learning_rate": 7.9469454384053e-06, + "loss": 0.8447, + "step": 3289 + }, + { + "epoch": 1.6720665777269552, + "grad_norm": 3.0505714416503906, + "learning_rate": 7.945587071335948e-06, + "loss": 0.7353, + "step": 3290 + }, + { + "epoch": 1.6725748046502762, + "grad_norm": 2.9668524265289307, + "learning_rate": 7.944228371224667e-06, + "loss": 0.7479, + "step": 3291 + }, + { + "epoch": 1.6730830315735976, + "grad_norm": 3.2085092067718506, + "learning_rate": 7.942869338225086e-06, + "loss": 0.9215, + "step": 3292 + }, + { + "epoch": 1.673591258496919, + "grad_norm": 3.120911121368408, + "learning_rate": 7.941509972490856e-06, + "loss": 0.852, + "step": 3293 + }, + { + "epoch": 1.67409948542024, + "grad_norm": 3.314965009689331, + "learning_rate": 7.940150274175677e-06, + "loss": 0.8492, + "step": 3294 + }, + { + "epoch": 1.6746077123435614, + "grad_norm": 3.2626428604125977, + "learning_rate": 7.938790243433285e-06, + "loss": 0.922, + "step": 3295 + }, + { + "epoch": 1.6751159392668826, + "grad_norm": 3.409306049346924, + "learning_rate": 7.937429880417447e-06, + "loss": 0.8554, + "step": 3296 + }, + { + "epoch": 1.6756241661902038, + "grad_norm": 3.1044716835021973, + "learning_rate": 7.936069185281974e-06, + "loss": 0.8706, + "step": 3297 + }, + { + "epoch": 1.6761323931135252, + "grad_norm": 3.5342752933502197, + "learning_rate": 7.934708158180713e-06, + "loss": 0.8668, + "step": 3298 + }, + { + "epoch": 1.6766406200368464, + "grad_norm": 3.315814971923828, + "learning_rate": 7.933346799267548e-06, + "loss": 0.7991, + "step": 3299 + }, + { + "epoch": 1.6771488469601676, + "grad_norm": 2.979701280593872, + "learning_rate": 7.931985108696401e-06, + "loss": 0.8347, + "step": 3300 + }, + { + "epoch": 1.677657073883489, + "grad_norm": 3.1003923416137695, + "learning_rate": 7.93062308662123e-06, + "loss": 0.8468, + "step": 3301 + }, + { + "epoch": 1.6781653008068103, + "grad_norm": 3.2387659549713135, + "learning_rate": 7.929260733196032e-06, + "loss": 0.9182, + "step": 3302 + }, + { + "epoch": 1.6786735277301315, + "grad_norm": 3.1733248233795166, + "learning_rate": 7.927898048574841e-06, + "loss": 0.8444, + "step": 3303 + }, + { + "epoch": 1.6791817546534529, + "grad_norm": 3.23020076751709, + "learning_rate": 7.926535032911728e-06, + "loss": 0.8839, + "step": 3304 + }, + { + "epoch": 1.679689981576774, + "grad_norm": 3.439688205718994, + "learning_rate": 7.925171686360803e-06, + "loss": 0.8456, + "step": 3305 + }, + { + "epoch": 1.6801982085000953, + "grad_norm": 3.2128794193267822, + "learning_rate": 7.923808009076213e-06, + "loss": 0.9149, + "step": 3306 + }, + { + "epoch": 1.6807064354234167, + "grad_norm": 2.9014220237731934, + "learning_rate": 7.922444001212139e-06, + "loss": 0.7875, + "step": 3307 + }, + { + "epoch": 1.6812146623467377, + "grad_norm": 3.3359878063201904, + "learning_rate": 7.921079662922806e-06, + "loss": 0.858, + "step": 3308 + }, + { + "epoch": 1.681722889270059, + "grad_norm": 2.9604530334472656, + "learning_rate": 7.919714994362471e-06, + "loss": 0.7724, + "step": 3309 + }, + { + "epoch": 1.6822311161933805, + "grad_norm": 3.2349345684051514, + "learning_rate": 7.918349995685428e-06, + "loss": 0.8352, + "step": 3310 + }, + { + "epoch": 1.6827393431167015, + "grad_norm": 2.8869545459747314, + "learning_rate": 7.916984667046012e-06, + "loss": 0.7956, + "step": 3311 + }, + { + "epoch": 1.683247570040023, + "grad_norm": 3.074676036834717, + "learning_rate": 7.915619008598592e-06, + "loss": 0.8504, + "step": 3312 + }, + { + "epoch": 1.683755796963344, + "grad_norm": 3.1231634616851807, + "learning_rate": 7.914253020497577e-06, + "loss": 0.7753, + "step": 3313 + }, + { + "epoch": 1.6842640238866653, + "grad_norm": 3.1155753135681152, + "learning_rate": 7.912886702897413e-06, + "loss": 0.8855, + "step": 3314 + }, + { + "epoch": 1.6847722508099867, + "grad_norm": 3.1568148136138916, + "learning_rate": 7.911520055952581e-06, + "loss": 0.8406, + "step": 3315 + }, + { + "epoch": 1.685280477733308, + "grad_norm": 3.1358795166015625, + "learning_rate": 7.9101530798176e-06, + "loss": 0.8323, + "step": 3316 + }, + { + "epoch": 1.6857887046566291, + "grad_norm": 3.40761661529541, + "learning_rate": 7.908785774647028e-06, + "loss": 0.8595, + "step": 3317 + }, + { + "epoch": 1.6862969315799505, + "grad_norm": 3.5222272872924805, + "learning_rate": 7.907418140595456e-06, + "loss": 0.9113, + "step": 3318 + }, + { + "epoch": 1.6868051585032717, + "grad_norm": 3.2144367694854736, + "learning_rate": 7.906050177817519e-06, + "loss": 0.8071, + "step": 3319 + }, + { + "epoch": 1.687313385426593, + "grad_norm": 3.3410897254943848, + "learning_rate": 7.904681886467885e-06, + "loss": 0.8993, + "step": 3320 + }, + { + "epoch": 1.6878216123499143, + "grad_norm": 2.950131416320801, + "learning_rate": 7.903313266701256e-06, + "loss": 0.8409, + "step": 3321 + }, + { + "epoch": 1.6883298392732355, + "grad_norm": 3.1286795139312744, + "learning_rate": 7.901944318672377e-06, + "loss": 0.7937, + "step": 3322 + }, + { + "epoch": 1.6888380661965567, + "grad_norm": 3.1939430236816406, + "learning_rate": 7.90057504253603e-06, + "loss": 0.8466, + "step": 3323 + }, + { + "epoch": 1.6893462931198782, + "grad_norm": 3.1400716304779053, + "learning_rate": 7.899205438447028e-06, + "loss": 0.8976, + "step": 3324 + }, + { + "epoch": 1.6898545200431991, + "grad_norm": 3.1489381790161133, + "learning_rate": 7.897835506560226e-06, + "loss": 0.8472, + "step": 3325 + }, + { + "epoch": 1.6903627469665206, + "grad_norm": 3.195754289627075, + "learning_rate": 7.896465247030514e-06, + "loss": 0.8202, + "step": 3326 + }, + { + "epoch": 1.690870973889842, + "grad_norm": 3.4317686557769775, + "learning_rate": 7.895094660012821e-06, + "loss": 0.9097, + "step": 3327 + }, + { + "epoch": 1.691379200813163, + "grad_norm": 3.1709091663360596, + "learning_rate": 7.893723745662114e-06, + "loss": 0.855, + "step": 3328 + }, + { + "epoch": 1.6918874277364844, + "grad_norm": 3.0010886192321777, + "learning_rate": 7.892352504133391e-06, + "loss": 0.8307, + "step": 3329 + }, + { + "epoch": 1.6923956546598056, + "grad_norm": 3.4652211666107178, + "learning_rate": 7.890980935581695e-06, + "loss": 0.8842, + "step": 3330 + }, + { + "epoch": 1.6929038815831268, + "grad_norm": 3.257430076599121, + "learning_rate": 7.8896090401621e-06, + "loss": 0.8528, + "step": 3331 + }, + { + "epoch": 1.6934121085064482, + "grad_norm": 3.176788806915283, + "learning_rate": 7.88823681802972e-06, + "loss": 0.8534, + "step": 3332 + }, + { + "epoch": 1.6939203354297694, + "grad_norm": 3.334630250930786, + "learning_rate": 7.886864269339703e-06, + "loss": 0.9219, + "step": 3333 + }, + { + "epoch": 1.6944285623530906, + "grad_norm": 3.25536847114563, + "learning_rate": 7.885491394247236e-06, + "loss": 0.9077, + "step": 3334 + }, + { + "epoch": 1.694936789276412, + "grad_norm": 3.5795812606811523, + "learning_rate": 7.884118192907543e-06, + "loss": 0.8206, + "step": 3335 + }, + { + "epoch": 1.6954450161997332, + "grad_norm": 3.35133957862854, + "learning_rate": 7.882744665475886e-06, + "loss": 0.8804, + "step": 3336 + }, + { + "epoch": 1.6959532431230544, + "grad_norm": 3.3669703006744385, + "learning_rate": 7.881370812107563e-06, + "loss": 0.7694, + "step": 3337 + }, + { + "epoch": 1.6964614700463758, + "grad_norm": 3.38563871383667, + "learning_rate": 7.879996632957904e-06, + "loss": 0.7634, + "step": 3338 + }, + { + "epoch": 1.696969696969697, + "grad_norm": 3.5372822284698486, + "learning_rate": 7.878622128182285e-06, + "loss": 0.929, + "step": 3339 + }, + { + "epoch": 1.6974779238930182, + "grad_norm": 3.052685022354126, + "learning_rate": 7.87724729793611e-06, + "loss": 0.9244, + "step": 3340 + }, + { + "epoch": 1.6979861508163396, + "grad_norm": 3.0986926555633545, + "learning_rate": 7.87587214237483e-06, + "loss": 0.9117, + "step": 3341 + }, + { + "epoch": 1.6984943777396606, + "grad_norm": 3.1174423694610596, + "learning_rate": 7.874496661653918e-06, + "loss": 0.8043, + "step": 3342 + }, + { + "epoch": 1.699002604662982, + "grad_norm": 3.176779270172119, + "learning_rate": 7.8731208559289e-06, + "loss": 0.838, + "step": 3343 + }, + { + "epoch": 1.6995108315863034, + "grad_norm": 3.2106759548187256, + "learning_rate": 7.871744725355324e-06, + "loss": 0.8462, + "step": 3344 + }, + { + "epoch": 1.7000190585096244, + "grad_norm": 3.2538504600524902, + "learning_rate": 7.870368270088789e-06, + "loss": 0.8153, + "step": 3345 + }, + { + "epoch": 1.7005272854329458, + "grad_norm": 2.95824933052063, + "learning_rate": 7.868991490284919e-06, + "loss": 0.8539, + "step": 3346 + }, + { + "epoch": 1.701035512356267, + "grad_norm": 3.3431808948516846, + "learning_rate": 7.86761438609938e-06, + "loss": 0.841, + "step": 3347 + }, + { + "epoch": 1.7015437392795882, + "grad_norm": 2.930280923843384, + "learning_rate": 7.866236957687874e-06, + "loss": 0.7645, + "step": 3348 + }, + { + "epoch": 1.7020519662029097, + "grad_norm": 3.450204610824585, + "learning_rate": 7.864859205206138e-06, + "loss": 0.8391, + "step": 3349 + }, + { + "epoch": 1.7025601931262309, + "grad_norm": 3.246631383895874, + "learning_rate": 7.863481128809952e-06, + "loss": 0.9022, + "step": 3350 + }, + { + "epoch": 1.703068420049552, + "grad_norm": 3.306354284286499, + "learning_rate": 7.862102728655122e-06, + "loss": 0.8004, + "step": 3351 + }, + { + "epoch": 1.7035766469728735, + "grad_norm": 3.3001654148101807, + "learning_rate": 7.8607240048975e-06, + "loss": 0.7788, + "step": 3352 + }, + { + "epoch": 1.7040848738961947, + "grad_norm": 2.9877474308013916, + "learning_rate": 7.859344957692972e-06, + "loss": 0.7975, + "step": 3353 + }, + { + "epoch": 1.7045931008195159, + "grad_norm": 3.221864938735962, + "learning_rate": 7.857965587197453e-06, + "loss": 0.9618, + "step": 3354 + }, + { + "epoch": 1.7051013277428373, + "grad_norm": 3.1139442920684814, + "learning_rate": 7.856585893566909e-06, + "loss": 0.7589, + "step": 3355 + }, + { + "epoch": 1.7056095546661585, + "grad_norm": 3.3803412914276123, + "learning_rate": 7.855205876957331e-06, + "loss": 0.8664, + "step": 3356 + }, + { + "epoch": 1.7061177815894797, + "grad_norm": 3.4577863216400146, + "learning_rate": 7.853825537524751e-06, + "loss": 0.9056, + "step": 3357 + }, + { + "epoch": 1.706626008512801, + "grad_norm": 2.8583593368530273, + "learning_rate": 7.852444875425234e-06, + "loss": 0.816, + "step": 3358 + }, + { + "epoch": 1.707134235436122, + "grad_norm": 2.987264394760132, + "learning_rate": 7.851063890814888e-06, + "loss": 0.8476, + "step": 3359 + }, + { + "epoch": 1.7076424623594435, + "grad_norm": 3.083441972732544, + "learning_rate": 7.84968258384985e-06, + "loss": 0.8287, + "step": 3360 + }, + { + "epoch": 1.7081506892827647, + "grad_norm": 3.007948160171509, + "learning_rate": 7.848300954686302e-06, + "loss": 0.8696, + "step": 3361 + }, + { + "epoch": 1.708658916206086, + "grad_norm": 3.2169318199157715, + "learning_rate": 7.846919003480453e-06, + "loss": 0.8461, + "step": 3362 + }, + { + "epoch": 1.7091671431294073, + "grad_norm": 3.058051586151123, + "learning_rate": 7.845536730388555e-06, + "loss": 0.7913, + "step": 3363 + }, + { + "epoch": 1.7096753700527285, + "grad_norm": 3.4415318965911865, + "learning_rate": 7.844154135566892e-06, + "loss": 0.8106, + "step": 3364 + }, + { + "epoch": 1.7101835969760497, + "grad_norm": 3.4176740646362305, + "learning_rate": 7.84277121917179e-06, + "loss": 0.8788, + "step": 3365 + }, + { + "epoch": 1.7106918238993711, + "grad_norm": 3.1206893920898438, + "learning_rate": 7.841387981359606e-06, + "loss": 0.8288, + "step": 3366 + }, + { + "epoch": 1.7112000508226923, + "grad_norm": 3.396747589111328, + "learning_rate": 7.840004422286735e-06, + "loss": 0.8438, + "step": 3367 + }, + { + "epoch": 1.7117082777460135, + "grad_norm": 3.080991744995117, + "learning_rate": 7.83862054210961e-06, + "loss": 0.7587, + "step": 3368 + }, + { + "epoch": 1.712216504669335, + "grad_norm": 3.3959927558898926, + "learning_rate": 7.837236340984699e-06, + "loss": 0.8476, + "step": 3369 + }, + { + "epoch": 1.7127247315926561, + "grad_norm": 3.123796224594116, + "learning_rate": 7.835851819068505e-06, + "loss": 0.8816, + "step": 3370 + }, + { + "epoch": 1.7132329585159773, + "grad_norm": 3.062106132507324, + "learning_rate": 7.834466976517569e-06, + "loss": 0.91, + "step": 3371 + }, + { + "epoch": 1.7137411854392988, + "grad_norm": 3.0195693969726562, + "learning_rate": 7.833081813488468e-06, + "loss": 0.7959, + "step": 3372 + }, + { + "epoch": 1.71424941236262, + "grad_norm": 3.146942377090454, + "learning_rate": 7.831696330137817e-06, + "loss": 0.882, + "step": 3373 + }, + { + "epoch": 1.7147576392859412, + "grad_norm": 3.1216883659362793, + "learning_rate": 7.830310526622261e-06, + "loss": 0.8257, + "step": 3374 + }, + { + "epoch": 1.7152658662092626, + "grad_norm": 3.703882932662964, + "learning_rate": 7.82892440309849e-06, + "loss": 0.818, + "step": 3375 + }, + { + "epoch": 1.7157740931325836, + "grad_norm": 3.1644270420074463, + "learning_rate": 7.827537959723222e-06, + "loss": 0.8017, + "step": 3376 + }, + { + "epoch": 1.716282320055905, + "grad_norm": 3.155853033065796, + "learning_rate": 7.826151196653216e-06, + "loss": 0.8255, + "step": 3377 + }, + { + "epoch": 1.7167905469792262, + "grad_norm": 3.078758716583252, + "learning_rate": 7.82476411404527e-06, + "loss": 0.7639, + "step": 3378 + }, + { + "epoch": 1.7172987739025474, + "grad_norm": 2.952954053878784, + "learning_rate": 7.823376712056205e-06, + "loss": 0.8544, + "step": 3379 + }, + { + "epoch": 1.7178070008258688, + "grad_norm": 3.054943323135376, + "learning_rate": 7.821988990842895e-06, + "loss": 0.8404, + "step": 3380 + }, + { + "epoch": 1.71831522774919, + "grad_norm": 2.981538772583008, + "learning_rate": 7.82060095056224e-06, + "loss": 0.9251, + "step": 3381 + }, + { + "epoch": 1.7188234546725112, + "grad_norm": 3.3590853214263916, + "learning_rate": 7.819212591371178e-06, + "loss": 0.9167, + "step": 3382 + }, + { + "epoch": 1.7193316815958326, + "grad_norm": 3.1496026515960693, + "learning_rate": 7.817823913426682e-06, + "loss": 0.898, + "step": 3383 + }, + { + "epoch": 1.7198399085191538, + "grad_norm": 3.1727194786071777, + "learning_rate": 7.816434916885767e-06, + "loss": 0.876, + "step": 3384 + }, + { + "epoch": 1.720348135442475, + "grad_norm": 3.156569004058838, + "learning_rate": 7.815045601905475e-06, + "loss": 0.8331, + "step": 3385 + }, + { + "epoch": 1.7208563623657964, + "grad_norm": 2.845827341079712, + "learning_rate": 7.81365596864289e-06, + "loss": 0.8177, + "step": 3386 + }, + { + "epoch": 1.7213645892891176, + "grad_norm": 3.048043966293335, + "learning_rate": 7.812266017255132e-06, + "loss": 0.8451, + "step": 3387 + }, + { + "epoch": 1.7218728162124388, + "grad_norm": 3.1950175762176514, + "learning_rate": 7.810875747899352e-06, + "loss": 0.8593, + "step": 3388 + }, + { + "epoch": 1.7223810431357602, + "grad_norm": 3.315939426422119, + "learning_rate": 7.809485160732744e-06, + "loss": 0.8856, + "step": 3389 + }, + { + "epoch": 1.7228892700590814, + "grad_norm": 3.349729299545288, + "learning_rate": 7.80809425591253e-06, + "loss": 0.8321, + "step": 3390 + }, + { + "epoch": 1.7233974969824026, + "grad_norm": 3.1980979442596436, + "learning_rate": 7.806703033595979e-06, + "loss": 0.851, + "step": 3391 + }, + { + "epoch": 1.723905723905724, + "grad_norm": 3.113279342651367, + "learning_rate": 7.805311493940382e-06, + "loss": 0.8821, + "step": 3392 + }, + { + "epoch": 1.724413950829045, + "grad_norm": 3.150865316390991, + "learning_rate": 7.803919637103077e-06, + "loss": 0.8508, + "step": 3393 + }, + { + "epoch": 1.7249221777523664, + "grad_norm": 3.0096330642700195, + "learning_rate": 7.802527463241432e-06, + "loss": 0.7343, + "step": 3394 + }, + { + "epoch": 1.7254304046756876, + "grad_norm": 3.2845497131347656, + "learning_rate": 7.801134972512856e-06, + "loss": 0.8722, + "step": 3395 + }, + { + "epoch": 1.7259386315990088, + "grad_norm": 2.9541282653808594, + "learning_rate": 7.799742165074784e-06, + "loss": 0.7932, + "step": 3396 + }, + { + "epoch": 1.7264468585223303, + "grad_norm": 3.258234977722168, + "learning_rate": 7.798349041084701e-06, + "loss": 0.9281, + "step": 3397 + }, + { + "epoch": 1.7269550854456515, + "grad_norm": 2.8658859729766846, + "learning_rate": 7.796955600700115e-06, + "loss": 0.8579, + "step": 3398 + }, + { + "epoch": 1.7274633123689727, + "grad_norm": 3.0659303665161133, + "learning_rate": 7.795561844078578e-06, + "loss": 0.8582, + "step": 3399 + }, + { + "epoch": 1.727971539292294, + "grad_norm": 3.235898733139038, + "learning_rate": 7.794167771377672e-06, + "loss": 0.8627, + "step": 3400 + }, + { + "epoch": 1.7284797662156153, + "grad_norm": 3.0602004528045654, + "learning_rate": 7.792773382755021e-06, + "loss": 0.849, + "step": 3401 + }, + { + "epoch": 1.7289879931389365, + "grad_norm": 3.159080743789673, + "learning_rate": 7.791378678368278e-06, + "loss": 0.8391, + "step": 3402 + }, + { + "epoch": 1.729496220062258, + "grad_norm": 3.1424660682678223, + "learning_rate": 7.789983658375134e-06, + "loss": 0.9017, + "step": 3403 + }, + { + "epoch": 1.730004446985579, + "grad_norm": 3.1947531700134277, + "learning_rate": 7.78858832293332e-06, + "loss": 0.7713, + "step": 3404 + }, + { + "epoch": 1.7305126739089003, + "grad_norm": 3.207350969314575, + "learning_rate": 7.787192672200597e-06, + "loss": 0.8945, + "step": 3405 + }, + { + "epoch": 1.7310209008322217, + "grad_norm": 3.4544808864593506, + "learning_rate": 7.785796706334762e-06, + "loss": 0.8222, + "step": 3406 + }, + { + "epoch": 1.7315291277555427, + "grad_norm": 3.1704776287078857, + "learning_rate": 7.784400425493656e-06, + "loss": 0.8524, + "step": 3407 + }, + { + "epoch": 1.732037354678864, + "grad_norm": 3.2776436805725098, + "learning_rate": 7.783003829835142e-06, + "loss": 0.8799, + "step": 3408 + }, + { + "epoch": 1.7325455816021855, + "grad_norm": 3.104471206665039, + "learning_rate": 7.78160691951713e-06, + "loss": 0.7841, + "step": 3409 + }, + { + "epoch": 1.7330538085255065, + "grad_norm": 3.2252237796783447, + "learning_rate": 7.780209694697558e-06, + "loss": 0.8334, + "step": 3410 + }, + { + "epoch": 1.733562035448828, + "grad_norm": 2.9332568645477295, + "learning_rate": 7.778812155534406e-06, + "loss": 0.8084, + "step": 3411 + }, + { + "epoch": 1.7340702623721491, + "grad_norm": 3.004978895187378, + "learning_rate": 7.777414302185683e-06, + "loss": 0.8543, + "step": 3412 + }, + { + "epoch": 1.7345784892954703, + "grad_norm": 3.2775914669036865, + "learning_rate": 7.776016134809439e-06, + "loss": 0.8399, + "step": 3413 + }, + { + "epoch": 1.7350867162187917, + "grad_norm": 2.82473087310791, + "learning_rate": 7.77461765356376e-06, + "loss": 0.7478, + "step": 3414 + }, + { + "epoch": 1.735594943142113, + "grad_norm": 3.2043254375457764, + "learning_rate": 7.77321885860676e-06, + "loss": 0.8112, + "step": 3415 + }, + { + "epoch": 1.7361031700654341, + "grad_norm": 3.1789662837982178, + "learning_rate": 7.771819750096594e-06, + "loss": 0.7874, + "step": 3416 + }, + { + "epoch": 1.7366113969887556, + "grad_norm": 3.2129077911376953, + "learning_rate": 7.770420328191454e-06, + "loss": 0.8202, + "step": 3417 + }, + { + "epoch": 1.7371196239120767, + "grad_norm": 3.1670689582824707, + "learning_rate": 7.769020593049565e-06, + "loss": 0.8352, + "step": 3418 + }, + { + "epoch": 1.737627850835398, + "grad_norm": 3.1509406566619873, + "learning_rate": 7.767620544829186e-06, + "loss": 0.7717, + "step": 3419 + }, + { + "epoch": 1.7381360777587194, + "grad_norm": 3.288256883621216, + "learning_rate": 7.766220183688615e-06, + "loss": 0.909, + "step": 3420 + }, + { + "epoch": 1.7386443046820406, + "grad_norm": 3.142703056335449, + "learning_rate": 7.76481950978618e-06, + "loss": 0.9001, + "step": 3421 + }, + { + "epoch": 1.7391525316053618, + "grad_norm": 3.0902483463287354, + "learning_rate": 7.763418523280253e-06, + "loss": 0.8006, + "step": 3422 + }, + { + "epoch": 1.7396607585286832, + "grad_norm": 3.544025421142578, + "learning_rate": 7.762017224329233e-06, + "loss": 0.8711, + "step": 3423 + }, + { + "epoch": 1.7401689854520042, + "grad_norm": 3.133329153060913, + "learning_rate": 7.760615613091557e-06, + "loss": 0.8377, + "step": 3424 + }, + { + "epoch": 1.7406772123753256, + "grad_norm": 3.357799530029297, + "learning_rate": 7.759213689725699e-06, + "loss": 0.8351, + "step": 3425 + }, + { + "epoch": 1.741185439298647, + "grad_norm": 2.8933751583099365, + "learning_rate": 7.757811454390168e-06, + "loss": 0.8533, + "step": 3426 + }, + { + "epoch": 1.741693666221968, + "grad_norm": 2.9360575675964355, + "learning_rate": 7.756408907243503e-06, + "loss": 0.8728, + "step": 3427 + }, + { + "epoch": 1.7422018931452894, + "grad_norm": 3.189209461212158, + "learning_rate": 7.755006048444287e-06, + "loss": 0.911, + "step": 3428 + }, + { + "epoch": 1.7427101200686106, + "grad_norm": 3.846020460128784, + "learning_rate": 7.753602878151132e-06, + "loss": 0.9189, + "step": 3429 + }, + { + "epoch": 1.7432183469919318, + "grad_norm": 2.9996988773345947, + "learning_rate": 7.752199396522688e-06, + "loss": 0.7928, + "step": 3430 + }, + { + "epoch": 1.7437265739152532, + "grad_norm": 3.2458527088165283, + "learning_rate": 7.750795603717637e-06, + "loss": 0.8081, + "step": 3431 + }, + { + "epoch": 1.7442348008385744, + "grad_norm": 3.339367151260376, + "learning_rate": 7.749391499894701e-06, + "loss": 0.8459, + "step": 3432 + }, + { + "epoch": 1.7447430277618956, + "grad_norm": 3.1276707649230957, + "learning_rate": 7.747987085212633e-06, + "loss": 0.8501, + "step": 3433 + }, + { + "epoch": 1.745251254685217, + "grad_norm": 3.230774164199829, + "learning_rate": 7.746582359830223e-06, + "loss": 0.9113, + "step": 3434 + }, + { + "epoch": 1.7457594816085382, + "grad_norm": 2.9944803714752197, + "learning_rate": 7.745177323906297e-06, + "loss": 0.815, + "step": 3435 + }, + { + "epoch": 1.7462677085318594, + "grad_norm": 3.396505117416382, + "learning_rate": 7.743771977599714e-06, + "loss": 0.8726, + "step": 3436 + }, + { + "epoch": 1.7467759354551808, + "grad_norm": 3.2798049449920654, + "learning_rate": 7.74236632106937e-06, + "loss": 0.8442, + "step": 3437 + }, + { + "epoch": 1.747284162378502, + "grad_norm": 3.106595039367676, + "learning_rate": 7.740960354474191e-06, + "loss": 0.8201, + "step": 3438 + }, + { + "epoch": 1.7477923893018232, + "grad_norm": 3.378309726715088, + "learning_rate": 7.73955407797315e-06, + "loss": 0.8769, + "step": 3439 + }, + { + "epoch": 1.7483006162251447, + "grad_norm": 3.187196731567383, + "learning_rate": 7.73814749172524e-06, + "loss": 0.8586, + "step": 3440 + }, + { + "epoch": 1.7488088431484656, + "grad_norm": 3.2755212783813477, + "learning_rate": 7.736740595889499e-06, + "loss": 0.7788, + "step": 3441 + }, + { + "epoch": 1.749317070071787, + "grad_norm": 3.3902981281280518, + "learning_rate": 7.735333390624999e-06, + "loss": 0.9026, + "step": 3442 + }, + { + "epoch": 1.7498252969951085, + "grad_norm": 3.0064620971679688, + "learning_rate": 7.733925876090842e-06, + "loss": 0.8739, + "step": 3443 + }, + { + "epoch": 1.7503335239184294, + "grad_norm": 3.1249990463256836, + "learning_rate": 7.73251805244617e-06, + "loss": 0.893, + "step": 3444 + }, + { + "epoch": 1.7508417508417509, + "grad_norm": 3.122293710708618, + "learning_rate": 7.731109919850156e-06, + "loss": 0.7919, + "step": 3445 + }, + { + "epoch": 1.751349977765072, + "grad_norm": 3.1727752685546875, + "learning_rate": 7.729701478462014e-06, + "loss": 0.8264, + "step": 3446 + }, + { + "epoch": 1.7518582046883933, + "grad_norm": 3.2961251735687256, + "learning_rate": 7.728292728440987e-06, + "loss": 0.7647, + "step": 3447 + }, + { + "epoch": 1.7523664316117147, + "grad_norm": 3.3101634979248047, + "learning_rate": 7.726883669946355e-06, + "loss": 0.9487, + "step": 3448 + }, + { + "epoch": 1.7528746585350359, + "grad_norm": 3.055027484893799, + "learning_rate": 7.725474303137432e-06, + "loss": 0.8389, + "step": 3449 + }, + { + "epoch": 1.753382885458357, + "grad_norm": 3.1277880668640137, + "learning_rate": 7.724064628173568e-06, + "loss": 0.8013, + "step": 3450 + }, + { + "epoch": 1.7538911123816785, + "grad_norm": 3.3328499794006348, + "learning_rate": 7.722654645214148e-06, + "loss": 0.9683, + "step": 3451 + }, + { + "epoch": 1.7543993393049997, + "grad_norm": 3.0421502590179443, + "learning_rate": 7.72124435441859e-06, + "loss": 0.8509, + "step": 3452 + }, + { + "epoch": 1.754907566228321, + "grad_norm": 3.255542516708374, + "learning_rate": 7.719833755946352e-06, + "loss": 0.8878, + "step": 3453 + }, + { + "epoch": 1.7554157931516423, + "grad_norm": 3.13769268989563, + "learning_rate": 7.718422849956918e-06, + "loss": 0.8319, + "step": 3454 + }, + { + "epoch": 1.7559240200749635, + "grad_norm": 3.3945152759552, + "learning_rate": 7.717011636609815e-06, + "loss": 0.8114, + "step": 3455 + }, + { + "epoch": 1.7564322469982847, + "grad_norm": 3.2403454780578613, + "learning_rate": 7.7156001160646e-06, + "loss": 0.8258, + "step": 3456 + }, + { + "epoch": 1.7569404739216061, + "grad_norm": 3.01177978515625, + "learning_rate": 7.714188288480864e-06, + "loss": 0.7997, + "step": 3457 + }, + { + "epoch": 1.757448700844927, + "grad_norm": 3.2744243144989014, + "learning_rate": 7.712776154018238e-06, + "loss": 0.897, + "step": 3458 + }, + { + "epoch": 1.7579569277682485, + "grad_norm": 3.0223116874694824, + "learning_rate": 7.711363712836387e-06, + "loss": 0.8106, + "step": 3459 + }, + { + "epoch": 1.75846515469157, + "grad_norm": 3.2434840202331543, + "learning_rate": 7.709950965095e-06, + "loss": 0.8571, + "step": 3460 + }, + { + "epoch": 1.758973381614891, + "grad_norm": 3.1417956352233887, + "learning_rate": 7.708537910953818e-06, + "loss": 0.9404, + "step": 3461 + }, + { + "epoch": 1.7594816085382123, + "grad_norm": 3.2836475372314453, + "learning_rate": 7.7071245505726e-06, + "loss": 0.8172, + "step": 3462 + }, + { + "epoch": 1.7599898354615335, + "grad_norm": 3.0664286613464355, + "learning_rate": 7.705710884111153e-06, + "loss": 0.8509, + "step": 3463 + }, + { + "epoch": 1.7604980623848547, + "grad_norm": 2.844975233078003, + "learning_rate": 7.70429691172931e-06, + "loss": 0.7531, + "step": 3464 + }, + { + "epoch": 1.7610062893081762, + "grad_norm": 3.3454537391662598, + "learning_rate": 7.702882633586941e-06, + "loss": 0.8593, + "step": 3465 + }, + { + "epoch": 1.7615145162314974, + "grad_norm": 3.070310115814209, + "learning_rate": 7.701468049843952e-06, + "loss": 0.9028, + "step": 3466 + }, + { + "epoch": 1.7620227431548185, + "grad_norm": 3.2803428173065186, + "learning_rate": 7.70005316066028e-06, + "loss": 0.7379, + "step": 3467 + }, + { + "epoch": 1.76253097007814, + "grad_norm": 3.622762680053711, + "learning_rate": 7.698637966195906e-06, + "loss": 0.9147, + "step": 3468 + }, + { + "epoch": 1.7630391970014612, + "grad_norm": 2.88554048538208, + "learning_rate": 7.69722246661083e-06, + "loss": 0.7526, + "step": 3469 + }, + { + "epoch": 1.7635474239247824, + "grad_norm": 3.2611470222473145, + "learning_rate": 7.6958066620651e-06, + "loss": 0.838, + "step": 3470 + }, + { + "epoch": 1.7640556508481038, + "grad_norm": 3.031313896179199, + "learning_rate": 7.694390552718791e-06, + "loss": 0.8521, + "step": 3471 + }, + { + "epoch": 1.764563877771425, + "grad_norm": 3.072566509246826, + "learning_rate": 7.692974138732018e-06, + "loss": 0.8519, + "step": 3472 + }, + { + "epoch": 1.7650721046947462, + "grad_norm": 3.1689980030059814, + "learning_rate": 7.691557420264926e-06, + "loss": 0.793, + "step": 3473 + }, + { + "epoch": 1.7655803316180676, + "grad_norm": 3.405853033065796, + "learning_rate": 7.690140397477694e-06, + "loss": 0.792, + "step": 3474 + }, + { + "epoch": 1.7660885585413886, + "grad_norm": 3.279622793197632, + "learning_rate": 7.688723070530539e-06, + "loss": 0.8657, + "step": 3475 + }, + { + "epoch": 1.76659678546471, + "grad_norm": 3.1858105659484863, + "learning_rate": 7.68730543958371e-06, + "loss": 0.8702, + "step": 3476 + }, + { + "epoch": 1.7671050123880314, + "grad_norm": 3.201594114303589, + "learning_rate": 7.685887504797494e-06, + "loss": 0.8724, + "step": 3477 + }, + { + "epoch": 1.7676132393113524, + "grad_norm": 3.152366876602173, + "learning_rate": 7.684469266332205e-06, + "loss": 0.7965, + "step": 3478 + }, + { + "epoch": 1.7681214662346738, + "grad_norm": 3.1901934146881104, + "learning_rate": 7.683050724348196e-06, + "loss": 0.8763, + "step": 3479 + }, + { + "epoch": 1.768629693157995, + "grad_norm": 3.3099849224090576, + "learning_rate": 7.681631879005857e-06, + "loss": 0.8521, + "step": 3480 + }, + { + "epoch": 1.7691379200813162, + "grad_norm": 3.154052257537842, + "learning_rate": 7.680212730465609e-06, + "loss": 0.9154, + "step": 3481 + }, + { + "epoch": 1.7696461470046376, + "grad_norm": 3.3573923110961914, + "learning_rate": 7.678793278887906e-06, + "loss": 0.8304, + "step": 3482 + }, + { + "epoch": 1.7701543739279588, + "grad_norm": 3.297215461730957, + "learning_rate": 7.677373524433238e-06, + "loss": 0.8368, + "step": 3483 + }, + { + "epoch": 1.77066260085128, + "grad_norm": 3.335425853729248, + "learning_rate": 7.67595346726213e-06, + "loss": 0.8798, + "step": 3484 + }, + { + "epoch": 1.7711708277746014, + "grad_norm": 2.9975199699401855, + "learning_rate": 7.674533107535138e-06, + "loss": 0.8346, + "step": 3485 + }, + { + "epoch": 1.7716790546979226, + "grad_norm": 3.0628726482391357, + "learning_rate": 7.673112445412859e-06, + "loss": 0.8318, + "step": 3486 + }, + { + "epoch": 1.7721872816212438, + "grad_norm": 3.0613350868225098, + "learning_rate": 7.671691481055915e-06, + "loss": 0.8484, + "step": 3487 + }, + { + "epoch": 1.7726955085445653, + "grad_norm": 3.252533435821533, + "learning_rate": 7.67027021462497e-06, + "loss": 0.8594, + "step": 3488 + }, + { + "epoch": 1.7732037354678865, + "grad_norm": 3.155071496963501, + "learning_rate": 7.668848646280718e-06, + "loss": 0.7437, + "step": 3489 + }, + { + "epoch": 1.7737119623912077, + "grad_norm": 3.096879005432129, + "learning_rate": 7.667426776183888e-06, + "loss": 0.7902, + "step": 3490 + }, + { + "epoch": 1.774220189314529, + "grad_norm": 3.074460744857788, + "learning_rate": 7.666004604495243e-06, + "loss": 0.8088, + "step": 3491 + }, + { + "epoch": 1.77472841623785, + "grad_norm": 3.132429599761963, + "learning_rate": 7.664582131375581e-06, + "loss": 0.81, + "step": 3492 + }, + { + "epoch": 1.7752366431611715, + "grad_norm": 3.136418581008911, + "learning_rate": 7.663159356985736e-06, + "loss": 0.9542, + "step": 3493 + }, + { + "epoch": 1.7757448700844929, + "grad_norm": 3.1513595581054688, + "learning_rate": 7.661736281486568e-06, + "loss": 0.8895, + "step": 3494 + }, + { + "epoch": 1.7762530970078139, + "grad_norm": 3.2499263286590576, + "learning_rate": 7.660312905038983e-06, + "loss": 0.9252, + "step": 3495 + }, + { + "epoch": 1.7767613239311353, + "grad_norm": 3.060739040374756, + "learning_rate": 7.65888922780391e-06, + "loss": 0.8141, + "step": 3496 + }, + { + "epoch": 1.7772695508544565, + "grad_norm": 3.1161351203918457, + "learning_rate": 7.657465249942318e-06, + "loss": 0.9581, + "step": 3497 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 3.0054283142089844, + "learning_rate": 7.656040971615209e-06, + "loss": 0.8671, + "step": 3498 + }, + { + "epoch": 1.778286004701099, + "grad_norm": 3.2062299251556396, + "learning_rate": 7.654616392983616e-06, + "loss": 0.8475, + "step": 3499 + }, + { + "epoch": 1.7787942316244203, + "grad_norm": 3.0939881801605225, + "learning_rate": 7.653191514208612e-06, + "loss": 0.8605, + "step": 3500 + }, + { + "epoch": 1.7787942316244203, + "eval_loss": 1.2510522603988647, + "eval_runtime": 14.8176, + "eval_samples_per_second": 26.995, + "eval_steps_per_second": 3.374, + "step": 3500 + } + ], + "logging_steps": 1.0, + "max_steps": 9835, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.8214743984989798e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}