{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 3507, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00028514399771884804, "grad_norm": 73.35986033232847, "learning_rate": 9.433962264150944e-08, "loss": 0.9276, "step": 1 }, { "epoch": 0.0005702879954376961, "grad_norm": 258.36563461213944, "learning_rate": 1.886792452830189e-07, "loss": 0.9933, "step": 2 }, { "epoch": 0.000855431993156544, "grad_norm": 166.05860991922756, "learning_rate": 2.8301886792452833e-07, "loss": 0.8841, "step": 3 }, { "epoch": 0.0011405759908753922, "grad_norm": 80.04075689751123, "learning_rate": 3.773584905660378e-07, "loss": 0.8703, "step": 4 }, { "epoch": 0.00142571998859424, "grad_norm": 94.13324251180957, "learning_rate": 4.716981132075472e-07, "loss": 0.7985, "step": 5 }, { "epoch": 0.001710863986313088, "grad_norm": 95.32563753359732, "learning_rate": 5.660377358490567e-07, "loss": 0.8165, "step": 6 }, { "epoch": 0.001996007984031936, "grad_norm": 107.85429854364156, "learning_rate": 6.603773584905661e-07, "loss": 0.7712, "step": 7 }, { "epoch": 0.0022811519817507843, "grad_norm": 81.4209081472541, "learning_rate": 7.547169811320755e-07, "loss": 0.8074, "step": 8 }, { "epoch": 0.0025662959794696323, "grad_norm": 102.25957565902327, "learning_rate": 8.490566037735849e-07, "loss": 0.71, "step": 9 }, { "epoch": 0.00285143997718848, "grad_norm": 93.50952676307716, "learning_rate": 9.433962264150944e-07, "loss": 0.5691, "step": 10 }, { "epoch": 0.003136583974907328, "grad_norm": 28.59917240226574, "learning_rate": 1.037735849056604e-06, "loss": 0.2667, "step": 11 }, { "epoch": 0.003421727972626176, "grad_norm": 31.65592294166331, "learning_rate": 1.1320754716981133e-06, "loss": 0.1884, "step": 12 }, { "epoch": 0.0037068719703450244, "grad_norm": 19.767941232025557, "learning_rate": 1.2264150943396227e-06, "loss": 0.1949, "step": 13 }, { "epoch": 0.003992015968063872, "grad_norm": 18.679288467520557, "learning_rate": 1.3207547169811322e-06, "loss": 0.1665, "step": 14 }, { "epoch": 0.00427715996578272, "grad_norm": 11.019999790672726, "learning_rate": 1.4150943396226415e-06, "loss": 0.083, "step": 15 }, { "epoch": 0.004562303963501569, "grad_norm": 10.59204980956432, "learning_rate": 1.509433962264151e-06, "loss": 0.1157, "step": 16 }, { "epoch": 0.004847447961220416, "grad_norm": 3.1617167080707995, "learning_rate": 1.6037735849056604e-06, "loss": 0.0322, "step": 17 }, { "epoch": 0.0051325919589392645, "grad_norm": 4.146835741088157, "learning_rate": 1.6981132075471698e-06, "loss": 0.0497, "step": 18 }, { "epoch": 0.005417735956658112, "grad_norm": 7.254151558151055, "learning_rate": 1.7924528301886793e-06, "loss": 0.0879, "step": 19 }, { "epoch": 0.00570287995437696, "grad_norm": 42.70760196636873, "learning_rate": 1.8867924528301889e-06, "loss": 0.1596, "step": 20 }, { "epoch": 0.005988023952095809, "grad_norm": 9.554484439254637, "learning_rate": 1.981132075471698e-06, "loss": 0.1138, "step": 21 }, { "epoch": 0.006273167949814656, "grad_norm": 7.238913126784841, "learning_rate": 2.075471698113208e-06, "loss": 0.0812, "step": 22 }, { "epoch": 0.006558311947533505, "grad_norm": 4.259308808989036, "learning_rate": 2.1698113207547173e-06, "loss": 0.0413, "step": 23 }, { "epoch": 0.006843455945252352, "grad_norm": 6.566396501150489, "learning_rate": 2.2641509433962266e-06, "loss": 0.0549, "step": 24 }, { "epoch": 0.0071285999429712005, "grad_norm": 5.066416110741747, "learning_rate": 2.358490566037736e-06, "loss": 0.037, "step": 25 }, { "epoch": 0.007413743940690049, "grad_norm": 4.723721752248351, "learning_rate": 2.4528301886792453e-06, "loss": 0.0585, "step": 26 }, { "epoch": 0.007698887938408896, "grad_norm": 2.560432972171629, "learning_rate": 2.547169811320755e-06, "loss": 0.0371, "step": 27 }, { "epoch": 0.007984031936127744, "grad_norm": 4.2665726201781276, "learning_rate": 2.6415094339622644e-06, "loss": 0.0507, "step": 28 }, { "epoch": 0.008269175933846592, "grad_norm": 4.63734681578259, "learning_rate": 2.7358490566037738e-06, "loss": 0.0511, "step": 29 }, { "epoch": 0.00855431993156544, "grad_norm": 2.5768861044123095, "learning_rate": 2.830188679245283e-06, "loss": 0.0357, "step": 30 }, { "epoch": 0.008839463929284289, "grad_norm": 3.634984231769664, "learning_rate": 2.9245283018867924e-06, "loss": 0.0625, "step": 31 }, { "epoch": 0.009124607927003137, "grad_norm": 4.840413439326456, "learning_rate": 3.018867924528302e-06, "loss": 0.0414, "step": 32 }, { "epoch": 0.009409751924721984, "grad_norm": 9.287836956870658, "learning_rate": 3.1132075471698115e-06, "loss": 0.0882, "step": 33 }, { "epoch": 0.009694895922440832, "grad_norm": 6.973138828619164, "learning_rate": 3.207547169811321e-06, "loss": 0.0593, "step": 34 }, { "epoch": 0.00998003992015968, "grad_norm": 3.6359531137785113, "learning_rate": 3.30188679245283e-06, "loss": 0.0335, "step": 35 }, { "epoch": 0.010265183917878529, "grad_norm": 3.4832523392273527, "learning_rate": 3.3962264150943395e-06, "loss": 0.0621, "step": 36 }, { "epoch": 0.010550327915597377, "grad_norm": 6.152559798979658, "learning_rate": 3.4905660377358493e-06, "loss": 0.0768, "step": 37 }, { "epoch": 0.010835471913316224, "grad_norm": 3.142333726843115, "learning_rate": 3.5849056603773586e-06, "loss": 0.0421, "step": 38 }, { "epoch": 0.011120615911035072, "grad_norm": 4.4376556128819615, "learning_rate": 3.679245283018868e-06, "loss": 0.0618, "step": 39 }, { "epoch": 0.01140575990875392, "grad_norm": 3.119036132535718, "learning_rate": 3.7735849056603777e-06, "loss": 0.0395, "step": 40 }, { "epoch": 0.011690903906472769, "grad_norm": 2.8819748677454564, "learning_rate": 3.8679245283018875e-06, "loss": 0.036, "step": 41 }, { "epoch": 0.011976047904191617, "grad_norm": 3.0285102030013147, "learning_rate": 3.962264150943396e-06, "loss": 0.0451, "step": 42 }, { "epoch": 0.012261191901910464, "grad_norm": 2.4198586652875713, "learning_rate": 4.056603773584906e-06, "loss": 0.0359, "step": 43 }, { "epoch": 0.012546335899629312, "grad_norm": 4.01556344965596, "learning_rate": 4.150943396226416e-06, "loss": 0.0654, "step": 44 }, { "epoch": 0.01283147989734816, "grad_norm": 4.152755564375582, "learning_rate": 4.245283018867925e-06, "loss": 0.0688, "step": 45 }, { "epoch": 0.01311662389506701, "grad_norm": 5.650506818749513, "learning_rate": 4.339622641509435e-06, "loss": 0.0621, "step": 46 }, { "epoch": 0.013401767892785858, "grad_norm": 4.772858263739585, "learning_rate": 4.4339622641509435e-06, "loss": 0.043, "step": 47 }, { "epoch": 0.013686911890504704, "grad_norm": 3.281056276083866, "learning_rate": 4.528301886792453e-06, "loss": 0.0409, "step": 48 }, { "epoch": 0.013972055888223553, "grad_norm": 2.665340695119781, "learning_rate": 4.622641509433963e-06, "loss": 0.0261, "step": 49 }, { "epoch": 0.014257199885942401, "grad_norm": 2.1051762190060757, "learning_rate": 4.716981132075472e-06, "loss": 0.046, "step": 50 }, { "epoch": 0.01454234388366125, "grad_norm": 4.140757787048907, "learning_rate": 4.811320754716982e-06, "loss": 0.0399, "step": 51 }, { "epoch": 0.014827487881380098, "grad_norm": 4.806224421900785, "learning_rate": 4.905660377358491e-06, "loss": 0.0733, "step": 52 }, { "epoch": 0.015112631879098944, "grad_norm": 5.244037355061446, "learning_rate": 5e-06, "loss": 0.0829, "step": 53 }, { "epoch": 0.015397775876817793, "grad_norm": 2.529956827555466, "learning_rate": 5.09433962264151e-06, "loss": 0.0414, "step": 54 }, { "epoch": 0.01568291987453664, "grad_norm": 3.1547732507353037, "learning_rate": 5.188679245283019e-06, "loss": 0.0492, "step": 55 }, { "epoch": 0.015968063872255488, "grad_norm": 3.6253749901832553, "learning_rate": 5.283018867924529e-06, "loss": 0.0409, "step": 56 }, { "epoch": 0.016253207869974338, "grad_norm": 8.40170080651301, "learning_rate": 5.377358490566038e-06, "loss": 0.0788, "step": 57 }, { "epoch": 0.016538351867693184, "grad_norm": 2.2891004276086444, "learning_rate": 5.4716981132075475e-06, "loss": 0.0452, "step": 58 }, { "epoch": 0.016823495865412035, "grad_norm": 1.968563607614644, "learning_rate": 5.566037735849057e-06, "loss": 0.0374, "step": 59 }, { "epoch": 0.01710863986313088, "grad_norm": 2.2826667099920424, "learning_rate": 5.660377358490566e-06, "loss": 0.0318, "step": 60 }, { "epoch": 0.017393783860849728, "grad_norm": 3.075829515844115, "learning_rate": 5.754716981132076e-06, "loss": 0.0456, "step": 61 }, { "epoch": 0.017678927858568578, "grad_norm": 2.641793301245682, "learning_rate": 5.849056603773585e-06, "loss": 0.0382, "step": 62 }, { "epoch": 0.017964071856287425, "grad_norm": 3.0357867774553275, "learning_rate": 5.943396226415095e-06, "loss": 0.0391, "step": 63 }, { "epoch": 0.018249215854006275, "grad_norm": 1.3459497638743567, "learning_rate": 6.037735849056604e-06, "loss": 0.0192, "step": 64 }, { "epoch": 0.01853435985172512, "grad_norm": 0.8896089027285865, "learning_rate": 6.132075471698113e-06, "loss": 0.026, "step": 65 }, { "epoch": 0.018819503849443968, "grad_norm": 17.332060998609155, "learning_rate": 6.226415094339623e-06, "loss": 0.0781, "step": 66 }, { "epoch": 0.019104647847162818, "grad_norm": 2.071064848030776, "learning_rate": 6.320754716981132e-06, "loss": 0.0431, "step": 67 }, { "epoch": 0.019389791844881665, "grad_norm": 4.062997719292194, "learning_rate": 6.415094339622642e-06, "loss": 0.0354, "step": 68 }, { "epoch": 0.019674935842600515, "grad_norm": 2.350390848534792, "learning_rate": 6.5094339622641515e-06, "loss": 0.0274, "step": 69 }, { "epoch": 0.01996007984031936, "grad_norm": 2.566245599377888, "learning_rate": 6.60377358490566e-06, "loss": 0.0699, "step": 70 }, { "epoch": 0.020245223838038208, "grad_norm": 3.5006687012241122, "learning_rate": 6.69811320754717e-06, "loss": 0.0684, "step": 71 }, { "epoch": 0.020530367835757058, "grad_norm": 4.243644946589001, "learning_rate": 6.792452830188679e-06, "loss": 0.0957, "step": 72 }, { "epoch": 0.020815511833475905, "grad_norm": 1.5269476114061467, "learning_rate": 6.886792452830189e-06, "loss": 0.0372, "step": 73 }, { "epoch": 0.021100655831194755, "grad_norm": 41.74516371814436, "learning_rate": 6.981132075471699e-06, "loss": 0.2785, "step": 74 }, { "epoch": 0.0213857998289136, "grad_norm": 3.8783540833989076, "learning_rate": 7.0754716981132075e-06, "loss": 0.0375, "step": 75 }, { "epoch": 0.021670943826632448, "grad_norm": 4.643543581746758, "learning_rate": 7.169811320754717e-06, "loss": 0.036, "step": 76 }, { "epoch": 0.021956087824351298, "grad_norm": 4.600001137811726, "learning_rate": 7.264150943396226e-06, "loss": 0.0636, "step": 77 }, { "epoch": 0.022241231822070145, "grad_norm": 3.128559332322622, "learning_rate": 7.358490566037736e-06, "loss": 0.0429, "step": 78 }, { "epoch": 0.022526375819788995, "grad_norm": 2.8641659687703678, "learning_rate": 7.452830188679246e-06, "loss": 0.042, "step": 79 }, { "epoch": 0.02281151981750784, "grad_norm": 12.271691655074866, "learning_rate": 7.5471698113207555e-06, "loss": 0.0862, "step": 80 }, { "epoch": 0.023096663815226688, "grad_norm": 5.5069227290477665, "learning_rate": 7.641509433962266e-06, "loss": 0.0403, "step": 81 }, { "epoch": 0.023381807812945538, "grad_norm": 1.4595795029247436, "learning_rate": 7.735849056603775e-06, "loss": 0.0266, "step": 82 }, { "epoch": 0.023666951810664385, "grad_norm": 8.876102538357202, "learning_rate": 7.830188679245284e-06, "loss": 0.0521, "step": 83 }, { "epoch": 0.023952095808383235, "grad_norm": 1.6782963289907105, "learning_rate": 7.924528301886793e-06, "loss": 0.0326, "step": 84 }, { "epoch": 0.02423723980610208, "grad_norm": 2.681755314762461, "learning_rate": 8.018867924528303e-06, "loss": 0.0839, "step": 85 }, { "epoch": 0.024522383803820928, "grad_norm": 5.714032889612906, "learning_rate": 8.113207547169812e-06, "loss": 0.0316, "step": 86 }, { "epoch": 0.02480752780153978, "grad_norm": 5.028130886749829, "learning_rate": 8.207547169811321e-06, "loss": 0.0731, "step": 87 }, { "epoch": 0.025092671799258625, "grad_norm": 3.0570511261289917, "learning_rate": 8.301886792452832e-06, "loss": 0.0333, "step": 88 }, { "epoch": 0.025377815796977475, "grad_norm": 5.146547952949438, "learning_rate": 8.39622641509434e-06, "loss": 0.0514, "step": 89 }, { "epoch": 0.02566295979469632, "grad_norm": 4.7531766250387255, "learning_rate": 8.49056603773585e-06, "loss": 0.0527, "step": 90 }, { "epoch": 0.02594810379241517, "grad_norm": 4.335096216573719, "learning_rate": 8.58490566037736e-06, "loss": 0.0268, "step": 91 }, { "epoch": 0.02623324779013402, "grad_norm": 2.132428089649692, "learning_rate": 8.67924528301887e-06, "loss": 0.0168, "step": 92 }, { "epoch": 0.026518391787852865, "grad_norm": 6.833117952820334, "learning_rate": 8.773584905660378e-06, "loss": 0.0868, "step": 93 }, { "epoch": 0.026803535785571715, "grad_norm": 11.160832029176879, "learning_rate": 8.867924528301887e-06, "loss": 0.0925, "step": 94 }, { "epoch": 0.027088679783290562, "grad_norm": 1.8890461785442043, "learning_rate": 8.962264150943398e-06, "loss": 0.0376, "step": 95 }, { "epoch": 0.02737382378100941, "grad_norm": 2.0039805311692573, "learning_rate": 9.056603773584907e-06, "loss": 0.0534, "step": 96 }, { "epoch": 0.02765896777872826, "grad_norm": 3.256805619531125, "learning_rate": 9.150943396226416e-06, "loss": 0.0476, "step": 97 }, { "epoch": 0.027944111776447105, "grad_norm": 2.3004970717202595, "learning_rate": 9.245283018867926e-06, "loss": 0.061, "step": 98 }, { "epoch": 0.028229255774165955, "grad_norm": 1.8448990035600177, "learning_rate": 9.339622641509435e-06, "loss": 0.0422, "step": 99 }, { "epoch": 0.028514399771884802, "grad_norm": 4.002283614106026, "learning_rate": 9.433962264150944e-06, "loss": 0.0603, "step": 100 }, { "epoch": 0.02879954376960365, "grad_norm": 8.180190475095822, "learning_rate": 9.528301886792455e-06, "loss": 0.1227, "step": 101 }, { "epoch": 0.0290846877673225, "grad_norm": 6.4834435384859015, "learning_rate": 9.622641509433963e-06, "loss": 0.0903, "step": 102 }, { "epoch": 0.029369831765041345, "grad_norm": 6.419140173971885, "learning_rate": 9.716981132075472e-06, "loss": 0.0973, "step": 103 }, { "epoch": 0.029654975762760195, "grad_norm": 5.167609682986552, "learning_rate": 9.811320754716981e-06, "loss": 0.0464, "step": 104 }, { "epoch": 0.029940119760479042, "grad_norm": 2.6286695618463733, "learning_rate": 9.905660377358492e-06, "loss": 0.047, "step": 105 }, { "epoch": 0.03022526375819789, "grad_norm": 2.385355633734913, "learning_rate": 1e-05, "loss": 0.0366, "step": 106 }, { "epoch": 0.03051040775591674, "grad_norm": 5.045824475523923, "learning_rate": 9.999997866825128e-06, "loss": 0.0928, "step": 107 }, { "epoch": 0.030795551753635585, "grad_norm": 5.882161471784182, "learning_rate": 9.999991467302332e-06, "loss": 0.1048, "step": 108 }, { "epoch": 0.031080695751354435, "grad_norm": 8.71717621944331, "learning_rate": 9.99998080143707e-06, "loss": 0.0885, "step": 109 }, { "epoch": 0.03136583974907328, "grad_norm": 2.090876100875096, "learning_rate": 9.999965869238445e-06, "loss": 0.0592, "step": 110 }, { "epoch": 0.03165098374679213, "grad_norm": 15.890424714577145, "learning_rate": 9.999946670719197e-06, "loss": 0.0438, "step": 111 }, { "epoch": 0.031936127744510975, "grad_norm": 1.4259811755691145, "learning_rate": 9.99992320589571e-06, "loss": 0.0395, "step": 112 }, { "epoch": 0.03222127174222983, "grad_norm": 1.9327015405417511, "learning_rate": 9.999895474788003e-06, "loss": 0.0371, "step": 113 }, { "epoch": 0.032506415739948676, "grad_norm": 1.8632732706869015, "learning_rate": 9.999863477419739e-06, "loss": 0.0245, "step": 114 }, { "epoch": 0.03279155973766752, "grad_norm": 1.8917211984814235, "learning_rate": 9.99982721381822e-06, "loss": 0.0407, "step": 115 }, { "epoch": 0.03307670373538637, "grad_norm": 2.2247432119892956, "learning_rate": 9.999786684014393e-06, "loss": 0.0334, "step": 116 }, { "epoch": 0.033361847733105215, "grad_norm": 5.678799374308072, "learning_rate": 9.999741888042832e-06, "loss": 0.0646, "step": 117 }, { "epoch": 0.03364699173082407, "grad_norm": 2.2979438290144656, "learning_rate": 9.99969282594177e-06, "loss": 0.0352, "step": 118 }, { "epoch": 0.033932135728542916, "grad_norm": 3.401247202535555, "learning_rate": 9.999639497753062e-06, "loss": 0.0457, "step": 119 }, { "epoch": 0.03421727972626176, "grad_norm": 1.3220531908758983, "learning_rate": 9.999581903522214e-06, "loss": 0.0148, "step": 120 }, { "epoch": 0.03450242372398061, "grad_norm": 1.0582272674397575, "learning_rate": 9.999520043298374e-06, "loss": 0.0214, "step": 121 }, { "epoch": 0.034787567721699456, "grad_norm": 1.5349811431314222, "learning_rate": 9.99945391713432e-06, "loss": 0.0237, "step": 122 }, { "epoch": 0.03507271171941831, "grad_norm": 5.061670772936218, "learning_rate": 9.999383525086478e-06, "loss": 0.0393, "step": 123 }, { "epoch": 0.035357855717137156, "grad_norm": 3.8169991926514313, "learning_rate": 9.999308867214908e-06, "loss": 0.0622, "step": 124 }, { "epoch": 0.035642999714856, "grad_norm": 33.60130740332581, "learning_rate": 9.999229943583318e-06, "loss": 0.0982, "step": 125 }, { "epoch": 0.03592814371257485, "grad_norm": 1.7047090809257088, "learning_rate": 9.999146754259048e-06, "loss": 0.0321, "step": 126 }, { "epoch": 0.036213287710293696, "grad_norm": 2.9744010851446103, "learning_rate": 9.999059299313082e-06, "loss": 0.0207, "step": 127 }, { "epoch": 0.03649843170801255, "grad_norm": 4.240730548981665, "learning_rate": 9.998967578820042e-06, "loss": 0.0289, "step": 128 }, { "epoch": 0.036783575705731396, "grad_norm": 3.3812717287163463, "learning_rate": 9.998871592858193e-06, "loss": 0.0495, "step": 129 }, { "epoch": 0.03706871970345024, "grad_norm": 1.8193189165651151, "learning_rate": 9.998771341509434e-06, "loss": 0.0667, "step": 130 }, { "epoch": 0.03735386370116909, "grad_norm": 6.060011032912712, "learning_rate": 9.998666824859307e-06, "loss": 0.1307, "step": 131 }, { "epoch": 0.037639007698887936, "grad_norm": 2.402353091095481, "learning_rate": 9.998558042996993e-06, "loss": 0.0736, "step": 132 }, { "epoch": 0.03792415169660679, "grad_norm": 5.839543926949511, "learning_rate": 9.998444996015314e-06, "loss": 0.0945, "step": 133 }, { "epoch": 0.038209295694325636, "grad_norm": 1.4473227758832712, "learning_rate": 9.998327684010727e-06, "loss": 0.0432, "step": 134 }, { "epoch": 0.03849443969204448, "grad_norm": 2.396507073747093, "learning_rate": 9.998206107083333e-06, "loss": 0.0679, "step": 135 }, { "epoch": 0.03877958368976333, "grad_norm": 2.6450829119213073, "learning_rate": 9.998080265336867e-06, "loss": 0.0206, "step": 136 }, { "epoch": 0.039064727687482176, "grad_norm": 8.44539739688482, "learning_rate": 9.997950158878712e-06, "loss": 0.0362, "step": 137 }, { "epoch": 0.03934987168520103, "grad_norm": 3.940799243397122, "learning_rate": 9.997815787819876e-06, "loss": 0.0452, "step": 138 }, { "epoch": 0.039635015682919876, "grad_norm": 1.2577768949582078, "learning_rate": 9.997677152275019e-06, "loss": 0.029, "step": 139 }, { "epoch": 0.03992015968063872, "grad_norm": 2.172531523049211, "learning_rate": 9.997534252362432e-06, "loss": 0.0289, "step": 140 }, { "epoch": 0.04020530367835757, "grad_norm": 1.7163630652895192, "learning_rate": 9.99738708820405e-06, "loss": 0.0374, "step": 141 }, { "epoch": 0.040490447676076416, "grad_norm": 1.9849481929945414, "learning_rate": 9.99723565992544e-06, "loss": 0.0385, "step": 142 }, { "epoch": 0.04077559167379527, "grad_norm": 1.0957767923455073, "learning_rate": 9.997079967655816e-06, "loss": 0.0345, "step": 143 }, { "epoch": 0.041060735671514116, "grad_norm": 1.845401941137247, "learning_rate": 9.996920011528022e-06, "loss": 0.0298, "step": 144 }, { "epoch": 0.04134587966923296, "grad_norm": 1.709458777909131, "learning_rate": 9.996755791678544e-06, "loss": 0.023, "step": 145 }, { "epoch": 0.04163102366695181, "grad_norm": 2.416877485324215, "learning_rate": 9.996587308247507e-06, "loss": 0.041, "step": 146 }, { "epoch": 0.041916167664670656, "grad_norm": 1.8466041607808619, "learning_rate": 9.996414561378671e-06, "loss": 0.0308, "step": 147 }, { "epoch": 0.04220131166238951, "grad_norm": 2.7453406578055746, "learning_rate": 9.996237551219439e-06, "loss": 0.0805, "step": 148 }, { "epoch": 0.042486455660108356, "grad_norm": 1.3012783290944823, "learning_rate": 9.996056277920845e-06, "loss": 0.0459, "step": 149 }, { "epoch": 0.0427715996578272, "grad_norm": 1.6845595049974698, "learning_rate": 9.995870741637566e-06, "loss": 0.0206, "step": 150 }, { "epoch": 0.04305674365554605, "grad_norm": 1.9673407080566452, "learning_rate": 9.995680942527915e-06, "loss": 0.0277, "step": 151 }, { "epoch": 0.043341887653264896, "grad_norm": 1.096649599808759, "learning_rate": 9.99548688075384e-06, "loss": 0.0104, "step": 152 }, { "epoch": 0.04362703165098375, "grad_norm": 4.340215678141815, "learning_rate": 9.99528855648093e-06, "loss": 0.0782, "step": 153 }, { "epoch": 0.043912175648702596, "grad_norm": 0.962700831894613, "learning_rate": 9.995085969878408e-06, "loss": 0.0084, "step": 154 }, { "epoch": 0.04419731964642144, "grad_norm": 1.3431951362099261, "learning_rate": 9.994879121119134e-06, "loss": 0.0347, "step": 155 }, { "epoch": 0.04448246364414029, "grad_norm": 2.396590839420942, "learning_rate": 9.99466801037961e-06, "loss": 0.0518, "step": 156 }, { "epoch": 0.044767607641859136, "grad_norm": 1.87825274755581, "learning_rate": 9.994452637839964e-06, "loss": 0.0316, "step": 157 }, { "epoch": 0.04505275163957799, "grad_norm": 2.6008704806074707, "learning_rate": 9.994233003683972e-06, "loss": 0.0338, "step": 158 }, { "epoch": 0.045337895637296836, "grad_norm": 3.2329437736534476, "learning_rate": 9.994009108099038e-06, "loss": 0.0821, "step": 159 }, { "epoch": 0.04562303963501568, "grad_norm": 1.59183218675036, "learning_rate": 9.99378095127621e-06, "loss": 0.0245, "step": 160 }, { "epoch": 0.04590818363273453, "grad_norm": 1.6404471636929772, "learning_rate": 9.993548533410162e-06, "loss": 0.0188, "step": 161 }, { "epoch": 0.046193327630453376, "grad_norm": 5.826108372985289, "learning_rate": 9.993311854699214e-06, "loss": 0.0963, "step": 162 }, { "epoch": 0.04647847162817223, "grad_norm": 3.8052953300535393, "learning_rate": 9.993070915345313e-06, "loss": 0.063, "step": 163 }, { "epoch": 0.046763615625891077, "grad_norm": 3.166036914377814, "learning_rate": 9.992825715554047e-06, "loss": 0.0391, "step": 164 }, { "epoch": 0.04704875962360992, "grad_norm": 0.9851882069270612, "learning_rate": 9.992576255534637e-06, "loss": 0.0412, "step": 165 }, { "epoch": 0.04733390362132877, "grad_norm": 1.9312831934515782, "learning_rate": 9.99232253549994e-06, "loss": 0.0201, "step": 166 }, { "epoch": 0.047619047619047616, "grad_norm": 0.902848957710233, "learning_rate": 9.992064555666448e-06, "loss": 0.0235, "step": 167 }, { "epoch": 0.04790419161676647, "grad_norm": 2.4508039029457254, "learning_rate": 9.991802316254286e-06, "loss": 0.0236, "step": 168 }, { "epoch": 0.04818933561448532, "grad_norm": 2.052929331756409, "learning_rate": 9.991535817487218e-06, "loss": 0.038, "step": 169 }, { "epoch": 0.04847447961220416, "grad_norm": 1.480706200852706, "learning_rate": 9.991265059592638e-06, "loss": 0.0556, "step": 170 }, { "epoch": 0.04875962360992301, "grad_norm": 4.557244503271385, "learning_rate": 9.990990042801573e-06, "loss": 0.0669, "step": 171 }, { "epoch": 0.049044767607641856, "grad_norm": 2.087362034480544, "learning_rate": 9.990710767348692e-06, "loss": 0.0531, "step": 172 }, { "epoch": 0.04932991160536071, "grad_norm": 1.290096043611282, "learning_rate": 9.990427233472286e-06, "loss": 0.0347, "step": 173 }, { "epoch": 0.04961505560307956, "grad_norm": 4.300260420077906, "learning_rate": 9.990139441414291e-06, "loss": 0.0686, "step": 174 }, { "epoch": 0.0499001996007984, "grad_norm": 2.3277629405503455, "learning_rate": 9.989847391420268e-06, "loss": 0.0355, "step": 175 }, { "epoch": 0.05018534359851725, "grad_norm": 1.0758597876794864, "learning_rate": 9.989551083739416e-06, "loss": 0.0379, "step": 176 }, { "epoch": 0.0504704875962361, "grad_norm": 2.296501614131635, "learning_rate": 9.989250518624566e-06, "loss": 0.0463, "step": 177 }, { "epoch": 0.05075563159395495, "grad_norm": 1.3724194734047541, "learning_rate": 9.98894569633218e-06, "loss": 0.0314, "step": 178 }, { "epoch": 0.0510407755916738, "grad_norm": 1.7362884773572984, "learning_rate": 9.988636617122354e-06, "loss": 0.0585, "step": 179 }, { "epoch": 0.05132591958939264, "grad_norm": 0.7782751167457802, "learning_rate": 9.988323281258817e-06, "loss": 0.0309, "step": 180 }, { "epoch": 0.05161106358711149, "grad_norm": 2.6395856764774583, "learning_rate": 9.988005689008926e-06, "loss": 0.0502, "step": 181 }, { "epoch": 0.05189620758483034, "grad_norm": 2.978324666472321, "learning_rate": 9.987683840643679e-06, "loss": 0.0429, "step": 182 }, { "epoch": 0.05218135158254919, "grad_norm": 3.4011230134034167, "learning_rate": 9.987357736437691e-06, "loss": 0.0637, "step": 183 }, { "epoch": 0.05246649558026804, "grad_norm": 1.941822329126047, "learning_rate": 9.987027376669224e-06, "loss": 0.0315, "step": 184 }, { "epoch": 0.052751639577986884, "grad_norm": 2.8549897117814567, "learning_rate": 9.986692761620163e-06, "loss": 0.0382, "step": 185 }, { "epoch": 0.05303678357570573, "grad_norm": 2.582295889648147, "learning_rate": 9.986353891576021e-06, "loss": 0.0385, "step": 186 }, { "epoch": 0.05332192757342458, "grad_norm": 2.471870150657678, "learning_rate": 9.98601076682595e-06, "loss": 0.0322, "step": 187 }, { "epoch": 0.05360707157114343, "grad_norm": 2.6685644561553006, "learning_rate": 9.985663387662726e-06, "loss": 0.0603, "step": 188 }, { "epoch": 0.05389221556886228, "grad_norm": 2.4073947494866186, "learning_rate": 9.985311754382758e-06, "loss": 0.0362, "step": 189 }, { "epoch": 0.054177359566581124, "grad_norm": 2.4836029774438666, "learning_rate": 9.984955867286083e-06, "loss": 0.0384, "step": 190 }, { "epoch": 0.05446250356429997, "grad_norm": 2.5496037219151018, "learning_rate": 9.98459572667637e-06, "loss": 0.055, "step": 191 }, { "epoch": 0.05474764756201882, "grad_norm": 2.2290862127290585, "learning_rate": 9.984231332860914e-06, "loss": 0.03, "step": 192 }, { "epoch": 0.05503279155973767, "grad_norm": 2.0539664937082893, "learning_rate": 9.983862686150644e-06, "loss": 0.0486, "step": 193 }, { "epoch": 0.05531793555745652, "grad_norm": 1.347135991088387, "learning_rate": 9.983489786860115e-06, "loss": 0.0335, "step": 194 }, { "epoch": 0.055603079555175364, "grad_norm": 1.152598221757865, "learning_rate": 9.983112635307508e-06, "loss": 0.0417, "step": 195 }, { "epoch": 0.05588822355289421, "grad_norm": 1.1394323676423384, "learning_rate": 9.982731231814637e-06, "loss": 0.04, "step": 196 }, { "epoch": 0.05617336755061306, "grad_norm": 0.5751858615186785, "learning_rate": 9.982345576706942e-06, "loss": 0.0257, "step": 197 }, { "epoch": 0.05645851154833191, "grad_norm": 2.228495954241724, "learning_rate": 9.981955670313491e-06, "loss": 0.0244, "step": 198 }, { "epoch": 0.05674365554605076, "grad_norm": 2.1167156058410566, "learning_rate": 9.98156151296698e-06, "loss": 0.0255, "step": 199 }, { "epoch": 0.057028799543769604, "grad_norm": 1.1499351454121258, "learning_rate": 9.981163105003731e-06, "loss": 0.0399, "step": 200 }, { "epoch": 0.05731394354148845, "grad_norm": 1.0863620605603235, "learning_rate": 9.980760446763693e-06, "loss": 0.0336, "step": 201 }, { "epoch": 0.0575990875392073, "grad_norm": 1.8021997995153731, "learning_rate": 9.980353538590441e-06, "loss": 0.049, "step": 202 }, { "epoch": 0.05788423153692615, "grad_norm": 0.8060673496091526, "learning_rate": 9.97994238083118e-06, "loss": 0.02, "step": 203 }, { "epoch": 0.058169375534645, "grad_norm": 2.8431892970960226, "learning_rate": 9.97952697383674e-06, "loss": 0.0459, "step": 204 }, { "epoch": 0.058454519532363844, "grad_norm": 2.2266671109715475, "learning_rate": 9.979107317961572e-06, "loss": 0.0379, "step": 205 }, { "epoch": 0.05873966353008269, "grad_norm": 1.4259804628148744, "learning_rate": 9.978683413563755e-06, "loss": 0.0346, "step": 206 }, { "epoch": 0.05902480752780154, "grad_norm": 1.9019710115897548, "learning_rate": 9.978255261004996e-06, "loss": 0.0321, "step": 207 }, { "epoch": 0.05930995152552039, "grad_norm": 3.2763885427824433, "learning_rate": 9.977822860650626e-06, "loss": 0.0327, "step": 208 }, { "epoch": 0.05959509552323924, "grad_norm": 4.085746329437377, "learning_rate": 9.977386212869597e-06, "loss": 0.0785, "step": 209 }, { "epoch": 0.059880239520958084, "grad_norm": 1.4461803192099967, "learning_rate": 9.976945318034487e-06, "loss": 0.0372, "step": 210 }, { "epoch": 0.06016538351867693, "grad_norm": 1.6159101224346348, "learning_rate": 9.9765001765215e-06, "loss": 0.0432, "step": 211 }, { "epoch": 0.06045052751639578, "grad_norm": 2.109127926712865, "learning_rate": 9.976050788710462e-06, "loss": 0.0474, "step": 212 }, { "epoch": 0.06073567151411463, "grad_norm": 1.6172712792447292, "learning_rate": 9.97559715498482e-06, "loss": 0.0299, "step": 213 }, { "epoch": 0.06102081551183348, "grad_norm": 1.1823346555015843, "learning_rate": 9.975139275731649e-06, "loss": 0.0224, "step": 214 }, { "epoch": 0.061305959509552324, "grad_norm": 1.2000963098670507, "learning_rate": 9.97467715134164e-06, "loss": 0.0362, "step": 215 }, { "epoch": 0.06159110350727117, "grad_norm": 1.6613781060710882, "learning_rate": 9.974210782209113e-06, "loss": 0.0382, "step": 216 }, { "epoch": 0.06187624750499002, "grad_norm": 2.2297827161797406, "learning_rate": 9.973740168732006e-06, "loss": 0.0407, "step": 217 }, { "epoch": 0.06216139150270887, "grad_norm": 3.0544112277949713, "learning_rate": 9.973265311311877e-06, "loss": 0.0534, "step": 218 }, { "epoch": 0.06244653550042772, "grad_norm": 1.8606402993779663, "learning_rate": 9.972786210353913e-06, "loss": 0.031, "step": 219 }, { "epoch": 0.06273167949814656, "grad_norm": 1.9835976433808742, "learning_rate": 9.97230286626691e-06, "loss": 0.0512, "step": 220 }, { "epoch": 0.06301682349586542, "grad_norm": 2.4671374293137474, "learning_rate": 9.971815279463294e-06, "loss": 0.0261, "step": 221 }, { "epoch": 0.06330196749358426, "grad_norm": 1.9426433678441557, "learning_rate": 9.971323450359109e-06, "loss": 0.0354, "step": 222 }, { "epoch": 0.06358711149130311, "grad_norm": 2.833304865793606, "learning_rate": 9.970827379374016e-06, "loss": 0.0658, "step": 223 }, { "epoch": 0.06387225548902195, "grad_norm": 1.364065742400369, "learning_rate": 9.9703270669313e-06, "loss": 0.023, "step": 224 }, { "epoch": 0.0641573994867408, "grad_norm": 2.0598784617586783, "learning_rate": 9.96982251345786e-06, "loss": 0.0303, "step": 225 }, { "epoch": 0.06444254348445966, "grad_norm": 1.6800528252103155, "learning_rate": 9.969313719384217e-06, "loss": 0.0232, "step": 226 }, { "epoch": 0.0647276874821785, "grad_norm": 2.5347983635463067, "learning_rate": 9.96880068514451e-06, "loss": 0.051, "step": 227 }, { "epoch": 0.06501283147989735, "grad_norm": 4.198478034633089, "learning_rate": 9.968283411176499e-06, "loss": 0.0637, "step": 228 }, { "epoch": 0.06529797547761619, "grad_norm": 2.4608303176324013, "learning_rate": 9.967761897921553e-06, "loss": 0.0356, "step": 229 }, { "epoch": 0.06558311947533504, "grad_norm": 1.1830386086249287, "learning_rate": 9.967236145824666e-06, "loss": 0.0225, "step": 230 }, { "epoch": 0.0658682634730539, "grad_norm": 2.335905650803093, "learning_rate": 9.966706155334445e-06, "loss": 0.0484, "step": 231 }, { "epoch": 0.06615340747077274, "grad_norm": 0.7365538318490851, "learning_rate": 9.966171926903116e-06, "loss": 0.0242, "step": 232 }, { "epoch": 0.06643855146849159, "grad_norm": 0.8332673431312235, "learning_rate": 9.965633460986521e-06, "loss": 0.0143, "step": 233 }, { "epoch": 0.06672369546621043, "grad_norm": 1.1115342044179635, "learning_rate": 9.965090758044116e-06, "loss": 0.0315, "step": 234 }, { "epoch": 0.06700883946392928, "grad_norm": 2.828412875905856, "learning_rate": 9.964543818538974e-06, "loss": 0.0724, "step": 235 }, { "epoch": 0.06729398346164814, "grad_norm": 1.0745851388736904, "learning_rate": 9.963992642937782e-06, "loss": 0.0155, "step": 236 }, { "epoch": 0.06757912745936698, "grad_norm": 1.9642520660921505, "learning_rate": 9.963437231710838e-06, "loss": 0.0296, "step": 237 }, { "epoch": 0.06786427145708583, "grad_norm": 0.6051981185401907, "learning_rate": 9.962877585332062e-06, "loss": 0.0177, "step": 238 }, { "epoch": 0.06814941545480467, "grad_norm": 4.122674586469668, "learning_rate": 9.962313704278981e-06, "loss": 0.0523, "step": 239 }, { "epoch": 0.06843455945252352, "grad_norm": 1.6897654144243972, "learning_rate": 9.96174558903274e-06, "loss": 0.0132, "step": 240 }, { "epoch": 0.06871970345024238, "grad_norm": 3.1253283435389196, "learning_rate": 9.961173240078092e-06, "loss": 0.0321, "step": 241 }, { "epoch": 0.06900484744796122, "grad_norm": 1.3965076576538442, "learning_rate": 9.960596657903407e-06, "loss": 0.029, "step": 242 }, { "epoch": 0.06928999144568007, "grad_norm": 1.9660002863377228, "learning_rate": 9.960015843000666e-06, "loss": 0.0522, "step": 243 }, { "epoch": 0.06957513544339891, "grad_norm": 1.3233952783403555, "learning_rate": 9.959430795865457e-06, "loss": 0.0353, "step": 244 }, { "epoch": 0.06986027944111776, "grad_norm": 1.7033322453474304, "learning_rate": 9.958841516996989e-06, "loss": 0.0223, "step": 245 }, { "epoch": 0.07014542343883662, "grad_norm": 2.5317243818879764, "learning_rate": 9.95824800689807e-06, "loss": 0.0598, "step": 246 }, { "epoch": 0.07043056743655546, "grad_norm": 2.69732116281974, "learning_rate": 9.957650266075129e-06, "loss": 0.0419, "step": 247 }, { "epoch": 0.07071571143427431, "grad_norm": 1.4150311093853096, "learning_rate": 9.957048295038197e-06, "loss": 0.0472, "step": 248 }, { "epoch": 0.07100085543199315, "grad_norm": 1.2224281181933392, "learning_rate": 9.95644209430092e-06, "loss": 0.0389, "step": 249 }, { "epoch": 0.071285999429712, "grad_norm": 51.194853358315605, "learning_rate": 9.955831664380548e-06, "loss": 0.0451, "step": 250 }, { "epoch": 0.07157114342743086, "grad_norm": 1.46254711704071, "learning_rate": 9.955217005797946e-06, "loss": 0.0276, "step": 251 }, { "epoch": 0.0718562874251497, "grad_norm": 3.4636231015416974, "learning_rate": 9.954598119077583e-06, "loss": 0.0396, "step": 252 }, { "epoch": 0.07214143142286855, "grad_norm": 0.8518861435588755, "learning_rate": 9.953975004747535e-06, "loss": 0.0313, "step": 253 }, { "epoch": 0.07242657542058739, "grad_norm": 1.4742031582432285, "learning_rate": 9.953347663339487e-06, "loss": 0.0397, "step": 254 }, { "epoch": 0.07271171941830624, "grad_norm": 0.8505976022407974, "learning_rate": 9.95271609538873e-06, "loss": 0.0259, "step": 255 }, { "epoch": 0.0729968634160251, "grad_norm": 1.3056007814387238, "learning_rate": 9.952080301434165e-06, "loss": 0.0109, "step": 256 }, { "epoch": 0.07328200741374394, "grad_norm": 2.734719524535765, "learning_rate": 9.951440282018294e-06, "loss": 0.0424, "step": 257 }, { "epoch": 0.07356715141146279, "grad_norm": 0.915900286106093, "learning_rate": 9.950796037687224e-06, "loss": 0.0279, "step": 258 }, { "epoch": 0.07385229540918163, "grad_norm": 1.1576885381186377, "learning_rate": 9.950147568990672e-06, "loss": 0.0291, "step": 259 }, { "epoch": 0.07413743940690048, "grad_norm": 1.1396907116761987, "learning_rate": 9.949494876481957e-06, "loss": 0.0573, "step": 260 }, { "epoch": 0.07442258340461934, "grad_norm": 0.8954392948824229, "learning_rate": 9.948837960718001e-06, "loss": 0.0109, "step": 261 }, { "epoch": 0.07470772740233818, "grad_norm": 2.2187692519372955, "learning_rate": 9.94817682225933e-06, "loss": 0.0405, "step": 262 }, { "epoch": 0.07499287140005703, "grad_norm": 1.1502737187174694, "learning_rate": 9.947511461670076e-06, "loss": 0.0194, "step": 263 }, { "epoch": 0.07527801539777587, "grad_norm": 1.061028206812728, "learning_rate": 9.946841879517968e-06, "loss": 0.0419, "step": 264 }, { "epoch": 0.07556315939549473, "grad_norm": 1.3832069086922913, "learning_rate": 9.94616807637434e-06, "loss": 0.015, "step": 265 }, { "epoch": 0.07584830339321358, "grad_norm": 0.6459735906834944, "learning_rate": 9.945490052814133e-06, "loss": 0.0255, "step": 266 }, { "epoch": 0.07613344739093242, "grad_norm": 1.1494098773455543, "learning_rate": 9.94480780941588e-06, "loss": 0.0346, "step": 267 }, { "epoch": 0.07641859138865127, "grad_norm": 0.9289181620653825, "learning_rate": 9.944121346761718e-06, "loss": 0.014, "step": 268 }, { "epoch": 0.07670373538637011, "grad_norm": 1.2203907189231082, "learning_rate": 9.943430665437388e-06, "loss": 0.0167, "step": 269 }, { "epoch": 0.07698887938408897, "grad_norm": 2.620259167299579, "learning_rate": 9.942735766032228e-06, "loss": 0.044, "step": 270 }, { "epoch": 0.07727402338180782, "grad_norm": 1.7206246319986442, "learning_rate": 9.94203664913917e-06, "loss": 0.0235, "step": 271 }, { "epoch": 0.07755916737952666, "grad_norm": 1.4642402716762304, "learning_rate": 9.941333315354755e-06, "loss": 0.0258, "step": 272 }, { "epoch": 0.07784431137724551, "grad_norm": 1.7553631989445817, "learning_rate": 9.940625765279112e-06, "loss": 0.0345, "step": 273 }, { "epoch": 0.07812945537496435, "grad_norm": 1.6716431070541198, "learning_rate": 9.939913999515976e-06, "loss": 0.0454, "step": 274 }, { "epoch": 0.0784145993726832, "grad_norm": 1.9526398951956696, "learning_rate": 9.939198018672671e-06, "loss": 0.0382, "step": 275 }, { "epoch": 0.07869974337040206, "grad_norm": 1.2954788594239341, "learning_rate": 9.938477823360127e-06, "loss": 0.0309, "step": 276 }, { "epoch": 0.0789848873681209, "grad_norm": 4.4338813137709385, "learning_rate": 9.937753414192862e-06, "loss": 0.072, "step": 277 }, { "epoch": 0.07927003136583975, "grad_norm": 1.56094475712123, "learning_rate": 9.937024791788991e-06, "loss": 0.0517, "step": 278 }, { "epoch": 0.07955517536355859, "grad_norm": 1.7375848109236927, "learning_rate": 9.93629195677023e-06, "loss": 0.0345, "step": 279 }, { "epoch": 0.07984031936127745, "grad_norm": 2.649452584682016, "learning_rate": 9.935554909761882e-06, "loss": 0.0594, "step": 280 }, { "epoch": 0.0801254633589963, "grad_norm": 1.6499457117477292, "learning_rate": 9.93481365139285e-06, "loss": 0.0376, "step": 281 }, { "epoch": 0.08041060735671514, "grad_norm": 1.1619489575147839, "learning_rate": 9.934068182295622e-06, "loss": 0.04, "step": 282 }, { "epoch": 0.08069575135443399, "grad_norm": 1.7247725696145613, "learning_rate": 9.933318503106291e-06, "loss": 0.0323, "step": 283 }, { "epoch": 0.08098089535215283, "grad_norm": 2.1577985956712666, "learning_rate": 9.93256461446453e-06, "loss": 0.0526, "step": 284 }, { "epoch": 0.08126603934987169, "grad_norm": 0.8867124373033844, "learning_rate": 9.931806517013612e-06, "loss": 0.0144, "step": 285 }, { "epoch": 0.08155118334759054, "grad_norm": 0.9443505140091544, "learning_rate": 9.9310442114004e-06, "loss": 0.0233, "step": 286 }, { "epoch": 0.08183632734530938, "grad_norm": 0.8757124141344964, "learning_rate": 9.930277698275347e-06, "loss": 0.0235, "step": 287 }, { "epoch": 0.08212147134302823, "grad_norm": 2.2248864075955805, "learning_rate": 9.92950697829249e-06, "loss": 0.0495, "step": 288 }, { "epoch": 0.08240661534074707, "grad_norm": 1.794320095937349, "learning_rate": 9.928732052109466e-06, "loss": 0.0575, "step": 289 }, { "epoch": 0.08269175933846593, "grad_norm": 0.5271974530133274, "learning_rate": 9.927952920387497e-06, "loss": 0.0174, "step": 290 }, { "epoch": 0.08297690333618478, "grad_norm": 2.130600355467149, "learning_rate": 9.92716958379139e-06, "loss": 0.0269, "step": 291 }, { "epoch": 0.08326204733390362, "grad_norm": 1.6078717603842572, "learning_rate": 9.926382042989544e-06, "loss": 0.0381, "step": 292 }, { "epoch": 0.08354719133162247, "grad_norm": 1.067101418411665, "learning_rate": 9.925590298653942e-06, "loss": 0.0416, "step": 293 }, { "epoch": 0.08383233532934131, "grad_norm": 1.8888059772289436, "learning_rate": 9.924794351460159e-06, "loss": 0.0431, "step": 294 }, { "epoch": 0.08411747932706017, "grad_norm": 1.1366438944954262, "learning_rate": 9.92399420208735e-06, "loss": 0.034, "step": 295 }, { "epoch": 0.08440262332477902, "grad_norm": 66.83013849628288, "learning_rate": 9.923189851218259e-06, "loss": 0.303, "step": 296 }, { "epoch": 0.08468776732249786, "grad_norm": 2.2777335388392643, "learning_rate": 9.922381299539214e-06, "loss": 0.0426, "step": 297 }, { "epoch": 0.08497291132021671, "grad_norm": 1.3777833351571647, "learning_rate": 9.921568547740131e-06, "loss": 0.0551, "step": 298 }, { "epoch": 0.08525805531793555, "grad_norm": 0.6388835255348255, "learning_rate": 9.920751596514502e-06, "loss": 0.0228, "step": 299 }, { "epoch": 0.0855431993156544, "grad_norm": 2.278298167977118, "learning_rate": 9.919930446559412e-06, "loss": 0.0349, "step": 300 }, { "epoch": 0.08582834331337326, "grad_norm": 0.873059407793711, "learning_rate": 9.91910509857552e-06, "loss": 0.0212, "step": 301 }, { "epoch": 0.0861134873110921, "grad_norm": 2.2132859296620038, "learning_rate": 9.918275553267069e-06, "loss": 0.0439, "step": 302 }, { "epoch": 0.08639863130881095, "grad_norm": 2.615477056500074, "learning_rate": 9.917441811341887e-06, "loss": 0.0299, "step": 303 }, { "epoch": 0.08668377530652979, "grad_norm": 1.0557781544658622, "learning_rate": 9.916603873511386e-06, "loss": 0.0158, "step": 304 }, { "epoch": 0.08696891930424865, "grad_norm": 3.9400306558554754, "learning_rate": 9.915761740490545e-06, "loss": 0.0444, "step": 305 }, { "epoch": 0.0872540633019675, "grad_norm": 3.1606798637531544, "learning_rate": 9.914915412997937e-06, "loss": 0.035, "step": 306 }, { "epoch": 0.08753920729968634, "grad_norm": 2.2379796936077754, "learning_rate": 9.914064891755703e-06, "loss": 0.0353, "step": 307 }, { "epoch": 0.08782435129740519, "grad_norm": 2.0924722936161007, "learning_rate": 9.91321017748957e-06, "loss": 0.0466, "step": 308 }, { "epoch": 0.08810949529512403, "grad_norm": 1.1933158739643803, "learning_rate": 9.91235127092884e-06, "loss": 0.0237, "step": 309 }, { "epoch": 0.08839463929284289, "grad_norm": 1.450838486064019, "learning_rate": 9.911488172806392e-06, "loss": 0.0296, "step": 310 }, { "epoch": 0.08867978329056174, "grad_norm": 2.8659368821250095, "learning_rate": 9.91062088385868e-06, "loss": 0.0644, "step": 311 }, { "epoch": 0.08896492728828058, "grad_norm": 2.3883067154612823, "learning_rate": 9.909749404825736e-06, "loss": 0.0558, "step": 312 }, { "epoch": 0.08925007128599943, "grad_norm": 1.338846186227719, "learning_rate": 9.90887373645117e-06, "loss": 0.0142, "step": 313 }, { "epoch": 0.08953521528371827, "grad_norm": 2.2418774559520105, "learning_rate": 9.907993879482161e-06, "loss": 0.0362, "step": 314 }, { "epoch": 0.08982035928143713, "grad_norm": 1.5944639604254414, "learning_rate": 9.907109834669465e-06, "loss": 0.0277, "step": 315 }, { "epoch": 0.09010550327915598, "grad_norm": 1.8710437857203388, "learning_rate": 9.90622160276741e-06, "loss": 0.0235, "step": 316 }, { "epoch": 0.09039064727687482, "grad_norm": 1.4117522494101777, "learning_rate": 9.905329184533897e-06, "loss": 0.0328, "step": 317 }, { "epoch": 0.09067579127459367, "grad_norm": 1.2111494335027815, "learning_rate": 9.904432580730404e-06, "loss": 0.0445, "step": 318 }, { "epoch": 0.09096093527231251, "grad_norm": 1.0923531873837928, "learning_rate": 9.90353179212197e-06, "loss": 0.0363, "step": 319 }, { "epoch": 0.09124607927003137, "grad_norm": 0.9108263761049934, "learning_rate": 9.902626819477214e-06, "loss": 0.0233, "step": 320 }, { "epoch": 0.09153122326775022, "grad_norm": 0.9052416100597438, "learning_rate": 9.901717663568323e-06, "loss": 0.0221, "step": 321 }, { "epoch": 0.09181636726546906, "grad_norm": 1.5417885755673133, "learning_rate": 9.900804325171052e-06, "loss": 0.0357, "step": 322 }, { "epoch": 0.09210151126318791, "grad_norm": 1.1483931076981324, "learning_rate": 9.899886805064723e-06, "loss": 0.042, "step": 323 }, { "epoch": 0.09238665526090675, "grad_norm": 0.8264374345543277, "learning_rate": 9.89896510403223e-06, "loss": 0.0135, "step": 324 }, { "epoch": 0.0926717992586256, "grad_norm": 3.028712934159954, "learning_rate": 9.898039222860032e-06, "loss": 0.096, "step": 325 }, { "epoch": 0.09295694325634446, "grad_norm": 3.908443382779185, "learning_rate": 9.897109162338157e-06, "loss": 0.0678, "step": 326 }, { "epoch": 0.0932420872540633, "grad_norm": 1.4504578551621745, "learning_rate": 9.896174923260198e-06, "loss": 0.028, "step": 327 }, { "epoch": 0.09352723125178215, "grad_norm": 1.0918597452992596, "learning_rate": 9.89523650642331e-06, "loss": 0.0352, "step": 328 }, { "epoch": 0.09381237524950099, "grad_norm": 1.3624134945746003, "learning_rate": 9.89429391262822e-06, "loss": 0.034, "step": 329 }, { "epoch": 0.09409751924721985, "grad_norm": 1.9386090064569985, "learning_rate": 9.893347142679211e-06, "loss": 0.0213, "step": 330 }, { "epoch": 0.0943826632449387, "grad_norm": 2.442430474342043, "learning_rate": 9.892396197384135e-06, "loss": 0.0274, "step": 331 }, { "epoch": 0.09466780724265754, "grad_norm": 2.7994856234060372, "learning_rate": 9.891441077554405e-06, "loss": 0.0496, "step": 332 }, { "epoch": 0.0949529512403764, "grad_norm": 1.0457890691940244, "learning_rate": 9.890481784004998e-06, "loss": 0.0197, "step": 333 }, { "epoch": 0.09523809523809523, "grad_norm": 6.1017012731170865, "learning_rate": 9.889518317554446e-06, "loss": 0.047, "step": 334 }, { "epoch": 0.09552323923581409, "grad_norm": 2.090726587494363, "learning_rate": 9.88855067902485e-06, "loss": 0.0326, "step": 335 }, { "epoch": 0.09580838323353294, "grad_norm": 0.699893313587921, "learning_rate": 9.887578869241866e-06, "loss": 0.0164, "step": 336 }, { "epoch": 0.09609352723125178, "grad_norm": 3.507183876219138, "learning_rate": 9.886602889034709e-06, "loss": 0.0416, "step": 337 }, { "epoch": 0.09637867122897063, "grad_norm": 0.5771253616906034, "learning_rate": 9.885622739236154e-06, "loss": 0.0161, "step": 338 }, { "epoch": 0.09666381522668947, "grad_norm": 1.231417111947857, "learning_rate": 9.884638420682534e-06, "loss": 0.0269, "step": 339 }, { "epoch": 0.09694895922440833, "grad_norm": 1.7586310033147416, "learning_rate": 9.883649934213738e-06, "loss": 0.0303, "step": 340 }, { "epoch": 0.09723410322212718, "grad_norm": 1.6088417745313677, "learning_rate": 9.882657280673212e-06, "loss": 0.0277, "step": 341 }, { "epoch": 0.09751924721984602, "grad_norm": 1.8001784084893846, "learning_rate": 9.881660460907957e-06, "loss": 0.0232, "step": 342 }, { "epoch": 0.09780439121756487, "grad_norm": 3.549553604790123, "learning_rate": 9.880659475768526e-06, "loss": 0.1015, "step": 343 }, { "epoch": 0.09808953521528371, "grad_norm": 1.0138498172858956, "learning_rate": 9.879654326109037e-06, "loss": 0.0219, "step": 344 }, { "epoch": 0.09837467921300257, "grad_norm": 0.5962872642185872, "learning_rate": 9.878645012787149e-06, "loss": 0.0073, "step": 345 }, { "epoch": 0.09865982321072142, "grad_norm": 1.5344142515023333, "learning_rate": 9.87763153666408e-06, "loss": 0.0188, "step": 346 }, { "epoch": 0.09894496720844026, "grad_norm": 2.6171999253216014, "learning_rate": 9.8766138986046e-06, "loss": 0.0448, "step": 347 }, { "epoch": 0.09923011120615911, "grad_norm": 2.0090501521581885, "learning_rate": 9.875592099477025e-06, "loss": 0.0481, "step": 348 }, { "epoch": 0.09951525520387795, "grad_norm": 2.3354612758931386, "learning_rate": 9.874566140153228e-06, "loss": 0.0417, "step": 349 }, { "epoch": 0.0998003992015968, "grad_norm": 1.5781598957515535, "learning_rate": 9.87353602150863e-06, "loss": 0.0188, "step": 350 }, { "epoch": 0.10008554319931566, "grad_norm": 1.928184575603502, "learning_rate": 9.8725017444222e-06, "loss": 0.0234, "step": 351 }, { "epoch": 0.1003706871970345, "grad_norm": 1.0876133323980701, "learning_rate": 9.871463309776455e-06, "loss": 0.009, "step": 352 }, { "epoch": 0.10065583119475335, "grad_norm": 11.324227273778364, "learning_rate": 9.870420718457458e-06, "loss": 0.011, "step": 353 }, { "epoch": 0.1009409751924722, "grad_norm": 2.3272703667045516, "learning_rate": 9.869373971354826e-06, "loss": 0.0427, "step": 354 }, { "epoch": 0.10122611919019105, "grad_norm": 1.2649728160919362, "learning_rate": 9.868323069361712e-06, "loss": 0.0259, "step": 355 }, { "epoch": 0.1015112631879099, "grad_norm": 2.0664662137667467, "learning_rate": 9.867268013374822e-06, "loss": 0.0226, "step": 356 }, { "epoch": 0.10179640718562874, "grad_norm": 0.9365874475796875, "learning_rate": 9.866208804294401e-06, "loss": 0.0107, "step": 357 }, { "epoch": 0.1020815511833476, "grad_norm": 0.5822681312488291, "learning_rate": 9.865145443024243e-06, "loss": 0.0073, "step": 358 }, { "epoch": 0.10236669518106643, "grad_norm": 0.5773090002383701, "learning_rate": 9.86407793047168e-06, "loss": 0.0064, "step": 359 }, { "epoch": 0.10265183917878529, "grad_norm": 1.298097020704675, "learning_rate": 9.863006267547591e-06, "loss": 0.0293, "step": 360 }, { "epoch": 0.10293698317650414, "grad_norm": 2.01105743317194, "learning_rate": 9.861930455166392e-06, "loss": 0.0423, "step": 361 }, { "epoch": 0.10322212717422298, "grad_norm": 3.854044900737908, "learning_rate": 9.86085049424604e-06, "loss": 0.0646, "step": 362 }, { "epoch": 0.10350727117194183, "grad_norm": 3.1428521017863393, "learning_rate": 9.859766385708035e-06, "loss": 0.0603, "step": 363 }, { "epoch": 0.10379241516966067, "grad_norm": 1.1415144666210106, "learning_rate": 9.858678130477415e-06, "loss": 0.0089, "step": 364 }, { "epoch": 0.10407755916737953, "grad_norm": 1.4442748658078282, "learning_rate": 9.857585729482753e-06, "loss": 0.0271, "step": 365 }, { "epoch": 0.10436270316509838, "grad_norm": 0.8289767742462578, "learning_rate": 9.856489183656163e-06, "loss": 0.0159, "step": 366 }, { "epoch": 0.10464784716281722, "grad_norm": 1.7124474237003318, "learning_rate": 9.855388493933298e-06, "loss": 0.0527, "step": 367 }, { "epoch": 0.10493299116053607, "grad_norm": 2.981703921126902, "learning_rate": 9.854283661253338e-06, "loss": 0.0357, "step": 368 }, { "epoch": 0.10521813515825491, "grad_norm": 2.1585606753904965, "learning_rate": 9.853174686559006e-06, "loss": 0.0393, "step": 369 }, { "epoch": 0.10550327915597377, "grad_norm": 1.5348910023395037, "learning_rate": 9.852061570796557e-06, "loss": 0.0398, "step": 370 }, { "epoch": 0.10578842315369262, "grad_norm": 1.535951948243033, "learning_rate": 9.85094431491578e-06, "loss": 0.0208, "step": 371 }, { "epoch": 0.10607356715141146, "grad_norm": 1.0394143122475457, "learning_rate": 9.849822919869993e-06, "loss": 0.0127, "step": 372 }, { "epoch": 0.10635871114913031, "grad_norm": 1.8866287812876101, "learning_rate": 9.848697386616052e-06, "loss": 0.0243, "step": 373 }, { "epoch": 0.10664385514684915, "grad_norm": 2.152613819596744, "learning_rate": 9.847567716114339e-06, "loss": 0.0475, "step": 374 }, { "epoch": 0.10692899914456801, "grad_norm": 1.8499798292082466, "learning_rate": 9.846433909328768e-06, "loss": 0.0482, "step": 375 }, { "epoch": 0.10721414314228686, "grad_norm": 2.1403198834038255, "learning_rate": 9.845295967226782e-06, "loss": 0.0359, "step": 376 }, { "epoch": 0.1074992871400057, "grad_norm": 0.679326626935762, "learning_rate": 9.844153890779352e-06, "loss": 0.014, "step": 377 }, { "epoch": 0.10778443113772455, "grad_norm": 2.4149761091081277, "learning_rate": 9.84300768096098e-06, "loss": 0.035, "step": 378 }, { "epoch": 0.1080695751354434, "grad_norm": 1.1906254157269311, "learning_rate": 9.841857338749693e-06, "loss": 0.0321, "step": 379 }, { "epoch": 0.10835471913316225, "grad_norm": 0.5918164701274412, "learning_rate": 9.840702865127039e-06, "loss": 0.0201, "step": 380 }, { "epoch": 0.1086398631308811, "grad_norm": 1.7497628829871825, "learning_rate": 9.839544261078099e-06, "loss": 0.0211, "step": 381 }, { "epoch": 0.10892500712859994, "grad_norm": 1.3983788980140046, "learning_rate": 9.838381527591475e-06, "loss": 0.0236, "step": 382 }, { "epoch": 0.1092101511263188, "grad_norm": 3.1851074927752623, "learning_rate": 9.83721466565929e-06, "loss": 0.0372, "step": 383 }, { "epoch": 0.10949529512403763, "grad_norm": 1.4171014306009728, "learning_rate": 9.836043676277195e-06, "loss": 0.0252, "step": 384 }, { "epoch": 0.10978043912175649, "grad_norm": 0.9261846959930898, "learning_rate": 9.83486856044436e-06, "loss": 0.0191, "step": 385 }, { "epoch": 0.11006558311947534, "grad_norm": 1.6296524588712908, "learning_rate": 9.833689319163473e-06, "loss": 0.0576, "step": 386 }, { "epoch": 0.11035072711719418, "grad_norm": 0.8019311699722914, "learning_rate": 9.832505953440748e-06, "loss": 0.0202, "step": 387 }, { "epoch": 0.11063587111491303, "grad_norm": 1.5113990773927573, "learning_rate": 9.831318464285914e-06, "loss": 0.046, "step": 388 }, { "epoch": 0.11092101511263187, "grad_norm": 2.728229672787799, "learning_rate": 9.83012685271222e-06, "loss": 0.0537, "step": 389 }, { "epoch": 0.11120615911035073, "grad_norm": 0.913884511065414, "learning_rate": 9.828931119736435e-06, "loss": 0.0102, "step": 390 }, { "epoch": 0.11149130310806958, "grad_norm": 4.01446357656375, "learning_rate": 9.827731266378839e-06, "loss": 0.061, "step": 391 }, { "epoch": 0.11177644710578842, "grad_norm": 1.3370546467948305, "learning_rate": 9.82652729366323e-06, "loss": 0.0371, "step": 392 }, { "epoch": 0.11206159110350727, "grad_norm": 1.3728857509837673, "learning_rate": 9.825319202616926e-06, "loss": 0.0258, "step": 393 }, { "epoch": 0.11234673510122611, "grad_norm": 1.8638416100609516, "learning_rate": 9.82410699427075e-06, "loss": 0.0386, "step": 394 }, { "epoch": 0.11263187909894497, "grad_norm": 0.7022867943321849, "learning_rate": 9.822890669659044e-06, "loss": 0.0144, "step": 395 }, { "epoch": 0.11291702309666382, "grad_norm": 1.4752375376718947, "learning_rate": 9.821670229819663e-06, "loss": 0.049, "step": 396 }, { "epoch": 0.11320216709438266, "grad_norm": 0.7023676759664046, "learning_rate": 9.820445675793973e-06, "loss": 0.0241, "step": 397 }, { "epoch": 0.11348731109210151, "grad_norm": 0.49880724393845305, "learning_rate": 9.819217008626847e-06, "loss": 0.0241, "step": 398 }, { "epoch": 0.11377245508982035, "grad_norm": 2.064257083055477, "learning_rate": 9.817984229366669e-06, "loss": 0.058, "step": 399 }, { "epoch": 0.11405759908753921, "grad_norm": 0.5760537754446529, "learning_rate": 9.816747339065333e-06, "loss": 0.0233, "step": 400 }, { "epoch": 0.11434274308525806, "grad_norm": 1.2124787291362593, "learning_rate": 9.81550633877824e-06, "loss": 0.0614, "step": 401 }, { "epoch": 0.1146278870829769, "grad_norm": 1.5895231814003101, "learning_rate": 9.8142612295643e-06, "loss": 0.0569, "step": 402 }, { "epoch": 0.11491303108069575, "grad_norm": 1.0484784015774489, "learning_rate": 9.813012012485925e-06, "loss": 0.0161, "step": 403 }, { "epoch": 0.1151981750784146, "grad_norm": 2.0015024924134988, "learning_rate": 9.811758688609036e-06, "loss": 0.0288, "step": 404 }, { "epoch": 0.11548331907613345, "grad_norm": 1.0183121982064807, "learning_rate": 9.810501259003058e-06, "loss": 0.0272, "step": 405 }, { "epoch": 0.1157684630738523, "grad_norm": 0.7040323962320703, "learning_rate": 9.809239724740913e-06, "loss": 0.0127, "step": 406 }, { "epoch": 0.11605360707157114, "grad_norm": 1.2738847744090185, "learning_rate": 9.807974086899037e-06, "loss": 0.0326, "step": 407 }, { "epoch": 0.11633875106929, "grad_norm": 1.6928730590425576, "learning_rate": 9.806704346557354e-06, "loss": 0.0467, "step": 408 }, { "epoch": 0.11662389506700883, "grad_norm": 1.5237688876565934, "learning_rate": 9.8054305047993e-06, "loss": 0.0428, "step": 409 }, { "epoch": 0.11690903906472769, "grad_norm": 2.5601247962488047, "learning_rate": 9.804152562711804e-06, "loss": 0.0312, "step": 410 }, { "epoch": 0.11719418306244654, "grad_norm": 1.6898452734577185, "learning_rate": 9.802870521385295e-06, "loss": 0.0165, "step": 411 }, { "epoch": 0.11747932706016538, "grad_norm": 3.202435062260781, "learning_rate": 9.801584381913702e-06, "loss": 0.0365, "step": 412 }, { "epoch": 0.11776447105788423, "grad_norm": 2.4208311464375267, "learning_rate": 9.800294145394449e-06, "loss": 0.0456, "step": 413 }, { "epoch": 0.11804961505560307, "grad_norm": 1.8464478511128473, "learning_rate": 9.798999812928454e-06, "loss": 0.0275, "step": 414 }, { "epoch": 0.11833475905332193, "grad_norm": 1.7466714628297064, "learning_rate": 9.797701385620135e-06, "loss": 0.0664, "step": 415 }, { "epoch": 0.11861990305104078, "grad_norm": 1.374398177525167, "learning_rate": 9.796398864577398e-06, "loss": 0.0249, "step": 416 }, { "epoch": 0.11890504704875962, "grad_norm": 0.682379469399641, "learning_rate": 9.795092250911646e-06, "loss": 0.0324, "step": 417 }, { "epoch": 0.11919019104647847, "grad_norm": 1.345099432009825, "learning_rate": 9.793781545737775e-06, "loss": 0.0304, "step": 418 }, { "epoch": 0.11947533504419731, "grad_norm": 2.459208618588478, "learning_rate": 9.79246675017417e-06, "loss": 0.038, "step": 419 }, { "epoch": 0.11976047904191617, "grad_norm": 1.1582497240978238, "learning_rate": 9.791147865342703e-06, "loss": 0.0239, "step": 420 }, { "epoch": 0.12004562303963502, "grad_norm": 0.6873012288522815, "learning_rate": 9.789824892368742e-06, "loss": 0.0219, "step": 421 }, { "epoch": 0.12033076703735386, "grad_norm": 1.4503792824156856, "learning_rate": 9.78849783238114e-06, "loss": 0.039, "step": 422 }, { "epoch": 0.12061591103507271, "grad_norm": 1.4695551960581217, "learning_rate": 9.787166686512237e-06, "loss": 0.0225, "step": 423 }, { "epoch": 0.12090105503279155, "grad_norm": 1.0755209767496783, "learning_rate": 9.785831455897859e-06, "loss": 0.0187, "step": 424 }, { "epoch": 0.12118619903051041, "grad_norm": 2.8435317285492223, "learning_rate": 9.784492141677318e-06, "loss": 0.0377, "step": 425 }, { "epoch": 0.12147134302822926, "grad_norm": 0.6194176889222343, "learning_rate": 9.783148744993413e-06, "loss": 0.014, "step": 426 }, { "epoch": 0.1217564870259481, "grad_norm": 1.197556570522883, "learning_rate": 9.781801266992421e-06, "loss": 0.0144, "step": 427 }, { "epoch": 0.12204163102366695, "grad_norm": 1.509780270356931, "learning_rate": 9.780449708824107e-06, "loss": 0.0149, "step": 428 }, { "epoch": 0.1223267750213858, "grad_norm": 1.0425109169214848, "learning_rate": 9.779094071641712e-06, "loss": 0.0287, "step": 429 }, { "epoch": 0.12261191901910465, "grad_norm": 2.9496669420120383, "learning_rate": 9.777734356601964e-06, "loss": 0.0152, "step": 430 }, { "epoch": 0.1228970630168235, "grad_norm": 1.4441615240576289, "learning_rate": 9.776370564865066e-06, "loss": 0.0265, "step": 431 }, { "epoch": 0.12318220701454234, "grad_norm": 2.8617609288687773, "learning_rate": 9.775002697594696e-06, "loss": 0.0259, "step": 432 }, { "epoch": 0.1234673510122612, "grad_norm": 2.019269634607327, "learning_rate": 9.773630755958021e-06, "loss": 0.0419, "step": 433 }, { "epoch": 0.12375249500998003, "grad_norm": 3.762297631517556, "learning_rate": 9.772254741125672e-06, "loss": 0.0328, "step": 434 }, { "epoch": 0.12403763900769889, "grad_norm": 1.9417548664384192, "learning_rate": 9.770874654271768e-06, "loss": 0.0572, "step": 435 }, { "epoch": 0.12432278300541774, "grad_norm": 1.7851777881726447, "learning_rate": 9.769490496573886e-06, "loss": 0.049, "step": 436 }, { "epoch": 0.12460792700313658, "grad_norm": 2.477982800391376, "learning_rate": 9.768102269213093e-06, "loss": 0.0705, "step": 437 }, { "epoch": 0.12489307100085544, "grad_norm": 0.3032358120092834, "learning_rate": 9.76670997337392e-06, "loss": 0.0047, "step": 438 }, { "epoch": 0.1251782149985743, "grad_norm": 1.483642571394467, "learning_rate": 9.765313610244372e-06, "loss": 0.0151, "step": 439 }, { "epoch": 0.12546335899629313, "grad_norm": 1.814367508425431, "learning_rate": 9.763913181015923e-06, "loss": 0.0459, "step": 440 }, { "epoch": 0.12574850299401197, "grad_norm": 0.7952597090870602, "learning_rate": 9.762508686883515e-06, "loss": 0.0201, "step": 441 }, { "epoch": 0.12603364699173084, "grad_norm": 3.185384390855697, "learning_rate": 9.761100129045565e-06, "loss": 0.0432, "step": 442 }, { "epoch": 0.12631879098944968, "grad_norm": 1.0661696789911552, "learning_rate": 9.759687508703948e-06, "loss": 0.0139, "step": 443 }, { "epoch": 0.12660393498716851, "grad_norm": 3.7725165197415964, "learning_rate": 9.758270827064016e-06, "loss": 0.0483, "step": 444 }, { "epoch": 0.12688907898488735, "grad_norm": 2.746968288043148, "learning_rate": 9.756850085334576e-06, "loss": 0.0574, "step": 445 }, { "epoch": 0.12717422298260622, "grad_norm": 0.926770487815625, "learning_rate": 9.755425284727908e-06, "loss": 0.018, "step": 446 }, { "epoch": 0.12745936698032506, "grad_norm": 0.9082707028939784, "learning_rate": 9.753996426459748e-06, "loss": 0.0263, "step": 447 }, { "epoch": 0.1277445109780439, "grad_norm": 1.164510189232619, "learning_rate": 9.752563511749301e-06, "loss": 0.0337, "step": 448 }, { "epoch": 0.12802965497576277, "grad_norm": 2.3418912893557704, "learning_rate": 9.75112654181923e-06, "loss": 0.0416, "step": 449 }, { "epoch": 0.1283147989734816, "grad_norm": 4.056148524420801, "learning_rate": 9.749685517895654e-06, "loss": 0.0539, "step": 450 }, { "epoch": 0.12859994297120045, "grad_norm": 1.3521227970804115, "learning_rate": 9.748240441208158e-06, "loss": 0.0395, "step": 451 }, { "epoch": 0.12888508696891932, "grad_norm": 1.9419704386778984, "learning_rate": 9.746791312989785e-06, "loss": 0.0393, "step": 452 }, { "epoch": 0.12917023096663816, "grad_norm": 1.0595383076344087, "learning_rate": 9.745338134477031e-06, "loss": 0.0214, "step": 453 }, { "epoch": 0.129455374964357, "grad_norm": 1.4679175053059728, "learning_rate": 9.743880906909849e-06, "loss": 0.0525, "step": 454 }, { "epoch": 0.12974051896207583, "grad_norm": 1.4310123455755626, "learning_rate": 9.742419631531647e-06, "loss": 0.0158, "step": 455 }, { "epoch": 0.1300256629597947, "grad_norm": 0.6588067935353689, "learning_rate": 9.740954309589288e-06, "loss": 0.0277, "step": 456 }, { "epoch": 0.13031080695751354, "grad_norm": 0.9397591945163838, "learning_rate": 9.739484942333087e-06, "loss": 0.0312, "step": 457 }, { "epoch": 0.13059595095523238, "grad_norm": 1.1729515696135353, "learning_rate": 9.738011531016809e-06, "loss": 0.0369, "step": 458 }, { "epoch": 0.13088109495295125, "grad_norm": 2.9106198831897734, "learning_rate": 9.736534076897676e-06, "loss": 0.0698, "step": 459 }, { "epoch": 0.1311662389506701, "grad_norm": 2.418685278178657, "learning_rate": 9.735052581236353e-06, "loss": 0.0529, "step": 460 }, { "epoch": 0.13145138294838893, "grad_norm": 1.1991557936515589, "learning_rate": 9.733567045296955e-06, "loss": 0.019, "step": 461 }, { "epoch": 0.1317365269461078, "grad_norm": 2.168170383166244, "learning_rate": 9.732077470347043e-06, "loss": 0.0336, "step": 462 }, { "epoch": 0.13202167094382664, "grad_norm": 0.8382241005884289, "learning_rate": 9.730583857657632e-06, "loss": 0.0241, "step": 463 }, { "epoch": 0.13230681494154548, "grad_norm": 1.1602419211992863, "learning_rate": 9.729086208503174e-06, "loss": 0.0285, "step": 464 }, { "epoch": 0.13259195893926431, "grad_norm": 2.3952533816912234, "learning_rate": 9.727584524161568e-06, "loss": 0.062, "step": 465 }, { "epoch": 0.13287710293698318, "grad_norm": 3.1634854012983693, "learning_rate": 9.726078805914156e-06, "loss": 0.0585, "step": 466 }, { "epoch": 0.13316224693470202, "grad_norm": 2.081821935988785, "learning_rate": 9.724569055045722e-06, "loss": 0.0497, "step": 467 }, { "epoch": 0.13344739093242086, "grad_norm": 1.7133129071243365, "learning_rate": 9.723055272844492e-06, "loss": 0.0231, "step": 468 }, { "epoch": 0.13373253493013973, "grad_norm": 1.3379025505204591, "learning_rate": 9.72153746060213e-06, "loss": 0.0296, "step": 469 }, { "epoch": 0.13401767892785857, "grad_norm": 2.4905984184567176, "learning_rate": 9.720015619613738e-06, "loss": 0.0341, "step": 470 }, { "epoch": 0.1343028229255774, "grad_norm": 1.7732686088505376, "learning_rate": 9.718489751177863e-06, "loss": 0.046, "step": 471 }, { "epoch": 0.13458796692329628, "grad_norm": 1.4099272711066784, "learning_rate": 9.716959856596476e-06, "loss": 0.0517, "step": 472 }, { "epoch": 0.13487311092101512, "grad_norm": 0.5759751962800088, "learning_rate": 9.715425937174992e-06, "loss": 0.0205, "step": 473 }, { "epoch": 0.13515825491873396, "grad_norm": 1.8213929340301374, "learning_rate": 9.71388799422226e-06, "loss": 0.0424, "step": 474 }, { "epoch": 0.1354433989164528, "grad_norm": 1.1189923917553266, "learning_rate": 9.712346029050561e-06, "loss": 0.0261, "step": 475 }, { "epoch": 0.13572854291417166, "grad_norm": 0.7740540942029566, "learning_rate": 9.710800042975604e-06, "loss": 0.0229, "step": 476 }, { "epoch": 0.1360136869118905, "grad_norm": 0.7017604368884549, "learning_rate": 9.709250037316535e-06, "loss": 0.0189, "step": 477 }, { "epoch": 0.13629883090960934, "grad_norm": 0.8260830471935292, "learning_rate": 9.707696013395929e-06, "loss": 0.0222, "step": 478 }, { "epoch": 0.1365839749073282, "grad_norm": 1.2766002745562819, "learning_rate": 9.706137972539784e-06, "loss": 0.0264, "step": 479 }, { "epoch": 0.13686911890504705, "grad_norm": 1.316995242325395, "learning_rate": 9.70457591607753e-06, "loss": 0.029, "step": 480 }, { "epoch": 0.1371542629027659, "grad_norm": 1.3031733188507375, "learning_rate": 9.703009845342027e-06, "loss": 0.0306, "step": 481 }, { "epoch": 0.13743940690048476, "grad_norm": 2.0441243406454515, "learning_rate": 9.701439761669551e-06, "loss": 0.0384, "step": 482 }, { "epoch": 0.1377245508982036, "grad_norm": 4.4329827478789765, "learning_rate": 9.69986566639981e-06, "loss": 0.0669, "step": 483 }, { "epoch": 0.13800969489592244, "grad_norm": 0.550863641831382, "learning_rate": 9.698287560875932e-06, "loss": 0.011, "step": 484 }, { "epoch": 0.13829483889364128, "grad_norm": 1.367528196902721, "learning_rate": 9.696705446444465e-06, "loss": 0.0475, "step": 485 }, { "epoch": 0.13857998289136014, "grad_norm": 2.9049463748248625, "learning_rate": 9.695119324455383e-06, "loss": 0.048, "step": 486 }, { "epoch": 0.13886512688907898, "grad_norm": 2.152275954468813, "learning_rate": 9.693529196262073e-06, "loss": 0.0426, "step": 487 }, { "epoch": 0.13915027088679782, "grad_norm": 1.4509315149948045, "learning_rate": 9.691935063221347e-06, "loss": 0.0371, "step": 488 }, { "epoch": 0.1394354148845167, "grad_norm": 0.924165717850074, "learning_rate": 9.690336926693427e-06, "loss": 0.0165, "step": 489 }, { "epoch": 0.13972055888223553, "grad_norm": 1.5645440774065995, "learning_rate": 9.688734788041958e-06, "loss": 0.0222, "step": 490 }, { "epoch": 0.14000570287995437, "grad_norm": 1.8857089742319397, "learning_rate": 9.687128648633995e-06, "loss": 0.0338, "step": 491 }, { "epoch": 0.14029084687767324, "grad_norm": 1.2496068613960891, "learning_rate": 9.685518509840008e-06, "loss": 0.0393, "step": 492 }, { "epoch": 0.14057599087539208, "grad_norm": 2.2609376738497797, "learning_rate": 9.683904373033884e-06, "loss": 0.0424, "step": 493 }, { "epoch": 0.14086113487311092, "grad_norm": 0.6716431956995677, "learning_rate": 9.682286239592912e-06, "loss": 0.0189, "step": 494 }, { "epoch": 0.14114627887082976, "grad_norm": 1.8445797069209673, "learning_rate": 9.6806641108978e-06, "loss": 0.0467, "step": 495 }, { "epoch": 0.14143142286854862, "grad_norm": 1.039428374248318, "learning_rate": 9.67903798833266e-06, "loss": 0.0296, "step": 496 }, { "epoch": 0.14171656686626746, "grad_norm": 2.241000783231958, "learning_rate": 9.677407873285016e-06, "loss": 0.021, "step": 497 }, { "epoch": 0.1420017108639863, "grad_norm": 2.1580510158063904, "learning_rate": 9.675773767145795e-06, "loss": 0.0404, "step": 498 }, { "epoch": 0.14228685486170517, "grad_norm": 2.1848379943693854, "learning_rate": 9.674135671309329e-06, "loss": 0.0392, "step": 499 }, { "epoch": 0.142571998859424, "grad_norm": 1.2914001435408309, "learning_rate": 9.672493587173356e-06, "loss": 0.0193, "step": 500 }, { "epoch": 0.14285714285714285, "grad_norm": 1.6915143202313678, "learning_rate": 9.670847516139019e-06, "loss": 0.0287, "step": 501 }, { "epoch": 0.14314228685486172, "grad_norm": 0.6349880464190206, "learning_rate": 9.66919745961086e-06, "loss": 0.011, "step": 502 }, { "epoch": 0.14342743085258056, "grad_norm": 1.3888680526489037, "learning_rate": 9.667543418996824e-06, "loss": 0.0206, "step": 503 }, { "epoch": 0.1437125748502994, "grad_norm": 1.9166676274285204, "learning_rate": 9.665885395708252e-06, "loss": 0.0321, "step": 504 }, { "epoch": 0.14399771884801824, "grad_norm": 0.6986695674839958, "learning_rate": 9.664223391159885e-06, "loss": 0.0222, "step": 505 }, { "epoch": 0.1442828628457371, "grad_norm": 0.9132750549727724, "learning_rate": 9.662557406769865e-06, "loss": 0.026, "step": 506 }, { "epoch": 0.14456800684345594, "grad_norm": 1.5540547156511098, "learning_rate": 9.660887443959726e-06, "loss": 0.039, "step": 507 }, { "epoch": 0.14485315084117478, "grad_norm": 0.9598545046429632, "learning_rate": 9.659213504154393e-06, "loss": 0.0211, "step": 508 }, { "epoch": 0.14513829483889365, "grad_norm": 2.2737404573757862, "learning_rate": 9.65753558878219e-06, "loss": 0.0239, "step": 509 }, { "epoch": 0.1454234388366125, "grad_norm": 1.5062825135884002, "learning_rate": 9.655853699274834e-06, "loss": 0.0345, "step": 510 }, { "epoch": 0.14570858283433133, "grad_norm": 1.6018765019531456, "learning_rate": 9.65416783706743e-06, "loss": 0.0209, "step": 511 }, { "epoch": 0.1459937268320502, "grad_norm": 1.2442982344382103, "learning_rate": 9.652478003598471e-06, "loss": 0.0285, "step": 512 }, { "epoch": 0.14627887082976904, "grad_norm": 1.9216622846090483, "learning_rate": 9.650784200309847e-06, "loss": 0.0339, "step": 513 }, { "epoch": 0.14656401482748788, "grad_norm": 2.3825991592561717, "learning_rate": 9.64908642864682e-06, "loss": 0.044, "step": 514 }, { "epoch": 0.14684915882520672, "grad_norm": 2.5818323385820916, "learning_rate": 9.647384690058058e-06, "loss": 0.0353, "step": 515 }, { "epoch": 0.14713430282292558, "grad_norm": 1.5120506419436006, "learning_rate": 9.645678985995597e-06, "loss": 0.0184, "step": 516 }, { "epoch": 0.14741944682064442, "grad_norm": 3.092529038403767, "learning_rate": 9.643969317914865e-06, "loss": 0.0459, "step": 517 }, { "epoch": 0.14770459081836326, "grad_norm": 1.6305582095574176, "learning_rate": 9.642255687274669e-06, "loss": 0.0745, "step": 518 }, { "epoch": 0.14798973481608213, "grad_norm": 1.9745427124068613, "learning_rate": 9.6405380955372e-06, "loss": 0.0431, "step": 519 }, { "epoch": 0.14827487881380097, "grad_norm": 1.1060350887232473, "learning_rate": 9.638816544168027e-06, "loss": 0.0401, "step": 520 }, { "epoch": 0.1485600228115198, "grad_norm": 1.48122125532716, "learning_rate": 9.637091034636097e-06, "loss": 0.0523, "step": 521 }, { "epoch": 0.14884516680923868, "grad_norm": 1.7854176889849915, "learning_rate": 9.635361568413739e-06, "loss": 0.0392, "step": 522 }, { "epoch": 0.14913031080695752, "grad_norm": 1.5771118563507494, "learning_rate": 9.633628146976649e-06, "loss": 0.0291, "step": 523 }, { "epoch": 0.14941545480467636, "grad_norm": 1.170813729239735, "learning_rate": 9.631890771803909e-06, "loss": 0.0324, "step": 524 }, { "epoch": 0.1497005988023952, "grad_norm": 1.1607083768274689, "learning_rate": 9.630149444377964e-06, "loss": 0.027, "step": 525 }, { "epoch": 0.14998574280011406, "grad_norm": 2.6420375275270946, "learning_rate": 9.628404166184639e-06, "loss": 0.0392, "step": 526 }, { "epoch": 0.1502708867978329, "grad_norm": 1.5977547719023326, "learning_rate": 9.626654938713128e-06, "loss": 0.039, "step": 527 }, { "epoch": 0.15055603079555174, "grad_norm": 2.4449410251819055, "learning_rate": 9.624901763455994e-06, "loss": 0.1085, "step": 528 }, { "epoch": 0.1508411747932706, "grad_norm": 1.66447527220779, "learning_rate": 9.623144641909167e-06, "loss": 0.0453, "step": 529 }, { "epoch": 0.15112631879098945, "grad_norm": 3.13057218774973, "learning_rate": 9.621383575571948e-06, "loss": 0.057, "step": 530 }, { "epoch": 0.1514114627887083, "grad_norm": 1.2160935811246358, "learning_rate": 9.619618565947e-06, "loss": 0.0424, "step": 531 }, { "epoch": 0.15169660678642716, "grad_norm": 1.4641008759404945, "learning_rate": 9.617849614540356e-06, "loss": 0.048, "step": 532 }, { "epoch": 0.151981750784146, "grad_norm": 1.4923618640634606, "learning_rate": 9.616076722861406e-06, "loss": 0.035, "step": 533 }, { "epoch": 0.15226689478186484, "grad_norm": 0.7354465216068986, "learning_rate": 9.614299892422905e-06, "loss": 0.0328, "step": 534 }, { "epoch": 0.15255203877958368, "grad_norm": 1.3156709351195746, "learning_rate": 9.61251912474097e-06, "loss": 0.0295, "step": 535 }, { "epoch": 0.15283718277730254, "grad_norm": 2.465389798366464, "learning_rate": 9.610734421335078e-06, "loss": 0.0355, "step": 536 }, { "epoch": 0.15312232677502138, "grad_norm": 2.107752927710834, "learning_rate": 9.608945783728061e-06, "loss": 0.0304, "step": 537 }, { "epoch": 0.15340747077274022, "grad_norm": 0.8382376238641022, "learning_rate": 9.60715321344611e-06, "loss": 0.0292, "step": 538 }, { "epoch": 0.1536926147704591, "grad_norm": 1.3467398331487281, "learning_rate": 9.605356712018773e-06, "loss": 0.0385, "step": 539 }, { "epoch": 0.15397775876817793, "grad_norm": 2.0809206260900552, "learning_rate": 9.603556280978947e-06, "loss": 0.0572, "step": 540 }, { "epoch": 0.15426290276589677, "grad_norm": 1.2901980818849303, "learning_rate": 9.60175192186289e-06, "loss": 0.0193, "step": 541 }, { "epoch": 0.15454804676361564, "grad_norm": 0.736806847215193, "learning_rate": 9.599943636210204e-06, "loss": 0.0284, "step": 542 }, { "epoch": 0.15483319076133448, "grad_norm": 1.5679254655317187, "learning_rate": 9.598131425563847e-06, "loss": 0.0317, "step": 543 }, { "epoch": 0.15511833475905332, "grad_norm": 0.7132612878107385, "learning_rate": 9.596315291470122e-06, "loss": 0.0114, "step": 544 }, { "epoch": 0.15540347875677216, "grad_norm": 2.593670926612122, "learning_rate": 9.594495235478685e-06, "loss": 0.0463, "step": 545 }, { "epoch": 0.15568862275449102, "grad_norm": 1.1955510290910427, "learning_rate": 9.59267125914253e-06, "loss": 0.0254, "step": 546 }, { "epoch": 0.15597376675220986, "grad_norm": 1.045769334119973, "learning_rate": 9.590843364018005e-06, "loss": 0.029, "step": 547 }, { "epoch": 0.1562589107499287, "grad_norm": 1.6801520699380361, "learning_rate": 9.589011551664797e-06, "loss": 0.0572, "step": 548 }, { "epoch": 0.15654405474764757, "grad_norm": 1.5891642039931364, "learning_rate": 9.587175823645936e-06, "loss": 0.0296, "step": 549 }, { "epoch": 0.1568291987453664, "grad_norm": 1.4409334872176962, "learning_rate": 9.585336181527795e-06, "loss": 0.0264, "step": 550 }, { "epoch": 0.15711434274308525, "grad_norm": 2.2586423562130116, "learning_rate": 9.583492626880082e-06, "loss": 0.058, "step": 551 }, { "epoch": 0.15739948674080412, "grad_norm": 1.661999933017668, "learning_rate": 9.581645161275852e-06, "loss": 0.0551, "step": 552 }, { "epoch": 0.15768463073852296, "grad_norm": 1.7655597655232174, "learning_rate": 9.579793786291486e-06, "loss": 0.0278, "step": 553 }, { "epoch": 0.1579697747362418, "grad_norm": 3.589911729513461, "learning_rate": 9.577938503506712e-06, "loss": 0.0517, "step": 554 }, { "epoch": 0.15825491873396064, "grad_norm": 3.001411920745496, "learning_rate": 9.576079314504584e-06, "loss": 0.0449, "step": 555 }, { "epoch": 0.1585400627316795, "grad_norm": 1.604218093067663, "learning_rate": 9.574216220871492e-06, "loss": 0.0246, "step": 556 }, { "epoch": 0.15882520672939834, "grad_norm": 0.935156656620672, "learning_rate": 9.57234922419716e-06, "loss": 0.0161, "step": 557 }, { "epoch": 0.15911035072711718, "grad_norm": 1.1513995231538736, "learning_rate": 9.570478326074638e-06, "loss": 0.0283, "step": 558 }, { "epoch": 0.15939549472483605, "grad_norm": 1.4012387052969366, "learning_rate": 9.568603528100306e-06, "loss": 0.0325, "step": 559 }, { "epoch": 0.1596806387225549, "grad_norm": 1.5162153854252265, "learning_rate": 9.566724831873876e-06, "loss": 0.0225, "step": 560 }, { "epoch": 0.15996578272027373, "grad_norm": 0.846516598739829, "learning_rate": 9.564842238998381e-06, "loss": 0.0099, "step": 561 }, { "epoch": 0.1602509267179926, "grad_norm": 2.647135598055157, "learning_rate": 9.562955751080183e-06, "loss": 0.0532, "step": 562 }, { "epoch": 0.16053607071571144, "grad_norm": 1.073278252413763, "learning_rate": 9.561065369728963e-06, "loss": 0.0355, "step": 563 }, { "epoch": 0.16082121471343028, "grad_norm": 1.170548007164538, "learning_rate": 9.559171096557728e-06, "loss": 0.029, "step": 564 }, { "epoch": 0.16110635871114912, "grad_norm": 1.9939002606252199, "learning_rate": 9.557272933182804e-06, "loss": 0.036, "step": 565 }, { "epoch": 0.16139150270886798, "grad_norm": 2.1480044032388466, "learning_rate": 9.555370881223837e-06, "loss": 0.0436, "step": 566 }, { "epoch": 0.16167664670658682, "grad_norm": 1.325637196128801, "learning_rate": 9.55346494230379e-06, "loss": 0.0157, "step": 567 }, { "epoch": 0.16196179070430566, "grad_norm": 0.7564851499202597, "learning_rate": 9.551555118048943e-06, "loss": 0.0249, "step": 568 }, { "epoch": 0.16224693470202453, "grad_norm": 0.9722401904633631, "learning_rate": 9.549641410088895e-06, "loss": 0.0231, "step": 569 }, { "epoch": 0.16253207869974337, "grad_norm": 0.8478045120928479, "learning_rate": 9.547723820056552e-06, "loss": 0.0105, "step": 570 }, { "epoch": 0.1628172226974622, "grad_norm": 1.351787113285877, "learning_rate": 9.545802349588136e-06, "loss": 0.0311, "step": 571 }, { "epoch": 0.16310236669518108, "grad_norm": 0.836648012433114, "learning_rate": 9.543877000323181e-06, "loss": 0.023, "step": 572 }, { "epoch": 0.16338751069289992, "grad_norm": 0.7901380885568204, "learning_rate": 9.54194777390453e-06, "loss": 0.014, "step": 573 }, { "epoch": 0.16367265469061876, "grad_norm": 1.2938393184716226, "learning_rate": 9.540014671978335e-06, "loss": 0.0632, "step": 574 }, { "epoch": 0.1639577986883376, "grad_norm": 1.2836949271527522, "learning_rate": 9.53807769619405e-06, "loss": 0.0231, "step": 575 }, { "epoch": 0.16424294268605646, "grad_norm": 2.27381818725564, "learning_rate": 9.536136848204443e-06, "loss": 0.0418, "step": 576 }, { "epoch": 0.1645280866837753, "grad_norm": 2.4569515266903026, "learning_rate": 9.534192129665578e-06, "loss": 0.0141, "step": 577 }, { "epoch": 0.16481323068149414, "grad_norm": 1.7019979551998075, "learning_rate": 9.532243542236826e-06, "loss": 0.0159, "step": 578 }, { "epoch": 0.165098374679213, "grad_norm": 0.7491727489330064, "learning_rate": 9.530291087580857e-06, "loss": 0.011, "step": 579 }, { "epoch": 0.16538351867693185, "grad_norm": 1.0697725490475511, "learning_rate": 9.528334767363643e-06, "loss": 0.0316, "step": 580 }, { "epoch": 0.1656686626746507, "grad_norm": 1.514917067065579, "learning_rate": 9.526374583254454e-06, "loss": 0.0247, "step": 581 }, { "epoch": 0.16595380667236956, "grad_norm": 1.6526372073335776, "learning_rate": 9.524410536925854e-06, "loss": 0.0242, "step": 582 }, { "epoch": 0.1662389506700884, "grad_norm": 1.154206493649726, "learning_rate": 9.522442630053708e-06, "loss": 0.0151, "step": 583 }, { "epoch": 0.16652409466780724, "grad_norm": 2.220783739275899, "learning_rate": 9.520470864317169e-06, "loss": 0.0405, "step": 584 }, { "epoch": 0.16680923866552608, "grad_norm": 2.1478234313482005, "learning_rate": 9.518495241398684e-06, "loss": 0.0255, "step": 585 }, { "epoch": 0.16709438266324494, "grad_norm": 1.888990019043023, "learning_rate": 9.516515762983996e-06, "loss": 0.023, "step": 586 }, { "epoch": 0.16737952666096378, "grad_norm": 0.568362822177166, "learning_rate": 9.514532430762133e-06, "loss": 0.0234, "step": 587 }, { "epoch": 0.16766467065868262, "grad_norm": 2.666053376987735, "learning_rate": 9.512545246425416e-06, "loss": 0.0334, "step": 588 }, { "epoch": 0.1679498146564015, "grad_norm": 0.9889048397741379, "learning_rate": 9.510554211669443e-06, "loss": 0.0272, "step": 589 }, { "epoch": 0.16823495865412033, "grad_norm": 2.663367144657973, "learning_rate": 9.50855932819311e-06, "loss": 0.0863, "step": 590 }, { "epoch": 0.16852010265183917, "grad_norm": 3.0842378390540564, "learning_rate": 9.506560597698588e-06, "loss": 0.0544, "step": 591 }, { "epoch": 0.16880524664955804, "grad_norm": 2.2352938418828945, "learning_rate": 9.504558021891335e-06, "loss": 0.0597, "step": 592 }, { "epoch": 0.16909039064727688, "grad_norm": 0.9718298379039145, "learning_rate": 9.502551602480087e-06, "loss": 0.0243, "step": 593 }, { "epoch": 0.16937553464499572, "grad_norm": 0.9679204345252653, "learning_rate": 9.500541341176865e-06, "loss": 0.0226, "step": 594 }, { "epoch": 0.16966067864271456, "grad_norm": 1.9624333575904165, "learning_rate": 9.498527239696962e-06, "loss": 0.0303, "step": 595 }, { "epoch": 0.16994582264043342, "grad_norm": 1.0983438075432492, "learning_rate": 9.496509299758949e-06, "loss": 0.0263, "step": 596 }, { "epoch": 0.17023096663815226, "grad_norm": 1.485540944433959, "learning_rate": 9.494487523084676e-06, "loss": 0.018, "step": 597 }, { "epoch": 0.1705161106358711, "grad_norm": 0.7546809410329396, "learning_rate": 9.492461911399265e-06, "loss": 0.0208, "step": 598 }, { "epoch": 0.17080125463358997, "grad_norm": 1.2028050799273944, "learning_rate": 9.490432466431107e-06, "loss": 0.0126, "step": 599 }, { "epoch": 0.1710863986313088, "grad_norm": 2.3029305869735377, "learning_rate": 9.488399189911866e-06, "loss": 0.0396, "step": 600 }, { "epoch": 0.17137154262902765, "grad_norm": 1.7854430171080835, "learning_rate": 9.486362083576479e-06, "loss": 0.0228, "step": 601 }, { "epoch": 0.17165668662674652, "grad_norm": 1.2139930054544301, "learning_rate": 9.484321149163145e-06, "loss": 0.053, "step": 602 }, { "epoch": 0.17194183062446536, "grad_norm": 1.8652637627111008, "learning_rate": 9.482276388413331e-06, "loss": 0.0243, "step": 603 }, { "epoch": 0.1722269746221842, "grad_norm": 1.733593841920375, "learning_rate": 9.480227803071775e-06, "loss": 0.0305, "step": 604 }, { "epoch": 0.17251211861990304, "grad_norm": 2.218846595287222, "learning_rate": 9.478175394886469e-06, "loss": 0.0345, "step": 605 }, { "epoch": 0.1727972626176219, "grad_norm": 2.4849800012227345, "learning_rate": 9.47611916560867e-06, "loss": 0.0359, "step": 606 }, { "epoch": 0.17308240661534074, "grad_norm": 3.832248662370049, "learning_rate": 9.474059116992901e-06, "loss": 0.0489, "step": 607 }, { "epoch": 0.17336755061305958, "grad_norm": 1.2556167721835865, "learning_rate": 9.471995250796936e-06, "loss": 0.0224, "step": 608 }, { "epoch": 0.17365269461077845, "grad_norm": 1.3725885311717074, "learning_rate": 9.469927568781814e-06, "loss": 0.0246, "step": 609 }, { "epoch": 0.1739378386084973, "grad_norm": 2.0268680892167312, "learning_rate": 9.467856072711821e-06, "loss": 0.0377, "step": 610 }, { "epoch": 0.17422298260621613, "grad_norm": 2.13735531583806, "learning_rate": 9.465780764354505e-06, "loss": 0.032, "step": 611 }, { "epoch": 0.174508126603935, "grad_norm": 1.0077175687597997, "learning_rate": 9.463701645480665e-06, "loss": 0.0125, "step": 612 }, { "epoch": 0.17479327060165384, "grad_norm": 1.4433624480865979, "learning_rate": 9.46161871786435e-06, "loss": 0.0213, "step": 613 }, { "epoch": 0.17507841459937268, "grad_norm": 1.7363894985553223, "learning_rate": 9.459531983282858e-06, "loss": 0.0312, "step": 614 }, { "epoch": 0.17536355859709152, "grad_norm": 1.1480559254413287, "learning_rate": 9.45744144351674e-06, "loss": 0.0126, "step": 615 }, { "epoch": 0.17564870259481039, "grad_norm": 1.0144730105958946, "learning_rate": 9.455347100349785e-06, "loss": 0.0465, "step": 616 }, { "epoch": 0.17593384659252922, "grad_norm": 2.3315897375286956, "learning_rate": 9.453248955569041e-06, "loss": 0.0375, "step": 617 }, { "epoch": 0.17621899059024806, "grad_norm": 1.517116537675244, "learning_rate": 9.451147010964786e-06, "loss": 0.0255, "step": 618 }, { "epoch": 0.17650413458796693, "grad_norm": 1.3539214290926733, "learning_rate": 9.449041268330549e-06, "loss": 0.0175, "step": 619 }, { "epoch": 0.17678927858568577, "grad_norm": 0.8250630045444727, "learning_rate": 9.446931729463093e-06, "loss": 0.0189, "step": 620 }, { "epoch": 0.1770744225834046, "grad_norm": 1.6566428427949393, "learning_rate": 9.44481839616243e-06, "loss": 0.0263, "step": 621 }, { "epoch": 0.17735956658112348, "grad_norm": 1.296549496531528, "learning_rate": 9.442701270231799e-06, "loss": 0.0357, "step": 622 }, { "epoch": 0.17764471057884232, "grad_norm": 1.2623177453839831, "learning_rate": 9.440580353477682e-06, "loss": 0.0138, "step": 623 }, { "epoch": 0.17792985457656116, "grad_norm": 0.9401424954471435, "learning_rate": 9.438455647709794e-06, "loss": 0.0344, "step": 624 }, { "epoch": 0.17821499857428, "grad_norm": 1.2061707620169295, "learning_rate": 9.436327154741082e-06, "loss": 0.0204, "step": 625 }, { "epoch": 0.17850014257199887, "grad_norm": 3.5812415668133295, "learning_rate": 9.434194876387723e-06, "loss": 0.0348, "step": 626 }, { "epoch": 0.1787852865697177, "grad_norm": 2.179594593613911, "learning_rate": 9.43205881446913e-06, "loss": 0.0405, "step": 627 }, { "epoch": 0.17907043056743654, "grad_norm": 3.133447888160429, "learning_rate": 9.429918970807939e-06, "loss": 0.0883, "step": 628 }, { "epoch": 0.1793555745651554, "grad_norm": 1.1894173921814282, "learning_rate": 9.427775347230013e-06, "loss": 0.0459, "step": 629 }, { "epoch": 0.17964071856287425, "grad_norm": 1.220795451723468, "learning_rate": 9.425627945564442e-06, "loss": 0.0276, "step": 630 }, { "epoch": 0.1799258625605931, "grad_norm": 0.9556676101760306, "learning_rate": 9.423476767643539e-06, "loss": 0.0408, "step": 631 }, { "epoch": 0.18021100655831196, "grad_norm": 1.3012772103524362, "learning_rate": 9.42132181530284e-06, "loss": 0.0147, "step": 632 }, { "epoch": 0.1804961505560308, "grad_norm": 0.8734603524960712, "learning_rate": 9.419163090381102e-06, "loss": 0.0232, "step": 633 }, { "epoch": 0.18078129455374964, "grad_norm": 1.2200670729025649, "learning_rate": 9.4170005947203e-06, "loss": 0.0329, "step": 634 }, { "epoch": 0.18106643855146848, "grad_norm": 1.1685662362242049, "learning_rate": 9.414834330165626e-06, "loss": 0.0372, "step": 635 }, { "epoch": 0.18135158254918735, "grad_norm": 0.6949628469219308, "learning_rate": 9.412664298565486e-06, "loss": 0.0246, "step": 636 }, { "epoch": 0.18163672654690619, "grad_norm": 1.771280848314573, "learning_rate": 9.410490501771507e-06, "loss": 0.0306, "step": 637 }, { "epoch": 0.18192187054462503, "grad_norm": 1.2498455618438535, "learning_rate": 9.408312941638522e-06, "loss": 0.0301, "step": 638 }, { "epoch": 0.1822070145423439, "grad_norm": 2.278447179095516, "learning_rate": 9.406131620024576e-06, "loss": 0.0466, "step": 639 }, { "epoch": 0.18249215854006273, "grad_norm": 1.0355835464758893, "learning_rate": 9.403946538790931e-06, "loss": 0.023, "step": 640 }, { "epoch": 0.18277730253778157, "grad_norm": 0.683840225577905, "learning_rate": 9.401757699802046e-06, "loss": 0.0171, "step": 641 }, { "epoch": 0.18306244653550044, "grad_norm": 2.090222100453515, "learning_rate": 9.399565104925591e-06, "loss": 0.0425, "step": 642 }, { "epoch": 0.18334759053321928, "grad_norm": 2.036391317556621, "learning_rate": 9.397368756032445e-06, "loss": 0.0266, "step": 643 }, { "epoch": 0.18363273453093812, "grad_norm": 1.738387885682237, "learning_rate": 9.395168654996685e-06, "loss": 0.0519, "step": 644 }, { "epoch": 0.18391787852865696, "grad_norm": 2.117881545376208, "learning_rate": 9.392964803695592e-06, "loss": 0.032, "step": 645 }, { "epoch": 0.18420302252637583, "grad_norm": 0.6059383955679531, "learning_rate": 9.390757204009644e-06, "loss": 0.0083, "step": 646 }, { "epoch": 0.18448816652409467, "grad_norm": 0.5792275998008327, "learning_rate": 9.38854585782252e-06, "loss": 0.0127, "step": 647 }, { "epoch": 0.1847733105218135, "grad_norm": 0.9221379221878636, "learning_rate": 9.386330767021098e-06, "loss": 0.0255, "step": 648 }, { "epoch": 0.18505845451953237, "grad_norm": 0.6920928264035611, "learning_rate": 9.384111933495442e-06, "loss": 0.0084, "step": 649 }, { "epoch": 0.1853435985172512, "grad_norm": 1.3255760857611412, "learning_rate": 9.381889359138823e-06, "loss": 0.0191, "step": 650 }, { "epoch": 0.18562874251497005, "grad_norm": 1.056441268866933, "learning_rate": 9.379663045847693e-06, "loss": 0.0479, "step": 651 }, { "epoch": 0.18591388651268892, "grad_norm": 2.0724011021981594, "learning_rate": 9.377432995521701e-06, "loss": 0.0248, "step": 652 }, { "epoch": 0.18619903051040776, "grad_norm": 1.7914795536241521, "learning_rate": 9.375199210063676e-06, "loss": 0.0342, "step": 653 }, { "epoch": 0.1864841745081266, "grad_norm": 1.540949023356334, "learning_rate": 9.372961691379649e-06, "loss": 0.0599, "step": 654 }, { "epoch": 0.18676931850584544, "grad_norm": 0.6650657087107381, "learning_rate": 9.37072044137882e-06, "loss": 0.0148, "step": 655 }, { "epoch": 0.1870544625035643, "grad_norm": 1.9248212315910453, "learning_rate": 9.368475461973582e-06, "loss": 0.0375, "step": 656 }, { "epoch": 0.18733960650128315, "grad_norm": 2.808110344533141, "learning_rate": 9.366226755079513e-06, "loss": 0.0422, "step": 657 }, { "epoch": 0.18762475049900199, "grad_norm": 2.1184357826543194, "learning_rate": 9.36397432261536e-06, "loss": 0.0282, "step": 658 }, { "epoch": 0.18790989449672085, "grad_norm": 1.5031809588854805, "learning_rate": 9.361718166503062e-06, "loss": 0.0151, "step": 659 }, { "epoch": 0.1881950384944397, "grad_norm": 0.9601384811433088, "learning_rate": 9.359458288667725e-06, "loss": 0.0414, "step": 660 }, { "epoch": 0.18848018249215853, "grad_norm": 2.0522251099402053, "learning_rate": 9.357194691037637e-06, "loss": 0.0436, "step": 661 }, { "epoch": 0.1887653264898774, "grad_norm": 0.8416753384643524, "learning_rate": 9.354927375544256e-06, "loss": 0.0124, "step": 662 }, { "epoch": 0.18905047048759624, "grad_norm": 1.0668298335424686, "learning_rate": 9.352656344122216e-06, "loss": 0.0439, "step": 663 }, { "epoch": 0.18933561448531508, "grad_norm": 1.34820663049341, "learning_rate": 9.350381598709319e-06, "loss": 0.0305, "step": 664 }, { "epoch": 0.18962075848303392, "grad_norm": 1.5705169220785646, "learning_rate": 9.348103141246538e-06, "loss": 0.0191, "step": 665 }, { "epoch": 0.1899059024807528, "grad_norm": 2.322186069862557, "learning_rate": 9.345820973678011e-06, "loss": 0.0284, "step": 666 }, { "epoch": 0.19019104647847163, "grad_norm": 1.344929313625686, "learning_rate": 9.343535097951044e-06, "loss": 0.0343, "step": 667 }, { "epoch": 0.19047619047619047, "grad_norm": 1.1700086372233123, "learning_rate": 9.341245516016105e-06, "loss": 0.018, "step": 668 }, { "epoch": 0.19076133447390933, "grad_norm": 1.0706753371606232, "learning_rate": 9.338952229826825e-06, "loss": 0.0273, "step": 669 }, { "epoch": 0.19104647847162817, "grad_norm": 1.6254449347628146, "learning_rate": 9.336655241339999e-06, "loss": 0.0367, "step": 670 }, { "epoch": 0.191331622469347, "grad_norm": 67.18866812915265, "learning_rate": 9.334354552515576e-06, "loss": 0.0859, "step": 671 }, { "epoch": 0.19161676646706588, "grad_norm": 191.39881973611224, "learning_rate": 9.332050165316664e-06, "loss": 0.5247, "step": 672 }, { "epoch": 0.19190191046478472, "grad_norm": 33.27059153720094, "learning_rate": 9.32974208170953e-06, "loss": 0.0762, "step": 673 }, { "epoch": 0.19218705446250356, "grad_norm": 2.1745024710329655, "learning_rate": 9.327430303663589e-06, "loss": 0.0331, "step": 674 }, { "epoch": 0.1924721984602224, "grad_norm": 0.958496001953392, "learning_rate": 9.325114833151414e-06, "loss": 0.0174, "step": 675 }, { "epoch": 0.19275734245794127, "grad_norm": 1.5630060617545107, "learning_rate": 9.322795672148726e-06, "loss": 0.0434, "step": 676 }, { "epoch": 0.1930424864556601, "grad_norm": 4.90486633391519, "learning_rate": 9.320472822634395e-06, "loss": 0.0542, "step": 677 }, { "epoch": 0.19332763045337895, "grad_norm": 0.5026753644154698, "learning_rate": 9.31814628659044e-06, "loss": 0.0108, "step": 678 }, { "epoch": 0.1936127744510978, "grad_norm": 2.318017474873092, "learning_rate": 9.315816066002024e-06, "loss": 0.0408, "step": 679 }, { "epoch": 0.19389791844881665, "grad_norm": 2.903171610551692, "learning_rate": 9.313482162857452e-06, "loss": 0.0557, "step": 680 }, { "epoch": 0.1941830624465355, "grad_norm": 1.4218762758102965, "learning_rate": 9.311144579148175e-06, "loss": 0.0159, "step": 681 }, { "epoch": 0.19446820644425436, "grad_norm": 2.009139671030263, "learning_rate": 9.308803316868783e-06, "loss": 0.0458, "step": 682 }, { "epoch": 0.1947533504419732, "grad_norm": 1.6113043050375866, "learning_rate": 9.306458378017004e-06, "loss": 0.0678, "step": 683 }, { "epoch": 0.19503849443969204, "grad_norm": 3.4623487395539505, "learning_rate": 9.304109764593705e-06, "loss": 0.0398, "step": 684 }, { "epoch": 0.19532363843741088, "grad_norm": 2.3381560335801646, "learning_rate": 9.301757478602886e-06, "loss": 0.0445, "step": 685 }, { "epoch": 0.19560878243512975, "grad_norm": 64.94755869755691, "learning_rate": 9.299401522051685e-06, "loss": 1.2332, "step": 686 }, { "epoch": 0.1958939264328486, "grad_norm": 18.353718727926776, "learning_rate": 9.297041896950365e-06, "loss": 0.1454, "step": 687 }, { "epoch": 0.19617907043056743, "grad_norm": 0.6617552332955904, "learning_rate": 9.294678605312323e-06, "loss": 0.0201, "step": 688 }, { "epoch": 0.1964642144282863, "grad_norm": 1.1973919773785329, "learning_rate": 9.292311649154088e-06, "loss": 0.0133, "step": 689 }, { "epoch": 0.19674935842600513, "grad_norm": 2.181331281045211, "learning_rate": 9.289941030495313e-06, "loss": 0.0279, "step": 690 }, { "epoch": 0.19703450242372397, "grad_norm": 11.102336408880397, "learning_rate": 9.287566751358773e-06, "loss": 0.0318, "step": 691 }, { "epoch": 0.19731964642144284, "grad_norm": 0.9201931408854377, "learning_rate": 9.285188813770368e-06, "loss": 0.025, "step": 692 }, { "epoch": 0.19760479041916168, "grad_norm": 1.508525890818712, "learning_rate": 9.282807219759123e-06, "loss": 0.0414, "step": 693 }, { "epoch": 0.19788993441688052, "grad_norm": 2.464653813173033, "learning_rate": 9.280421971357181e-06, "loss": 0.0389, "step": 694 }, { "epoch": 0.19817507841459936, "grad_norm": 0.8743425835485196, "learning_rate": 9.2780330705998e-06, "loss": 0.0165, "step": 695 }, { "epoch": 0.19846022241231823, "grad_norm": 1.8205084856824216, "learning_rate": 9.27564051952536e-06, "loss": 0.0286, "step": 696 }, { "epoch": 0.19874536641003707, "grad_norm": 1.3224132996211422, "learning_rate": 9.273244320175352e-06, "loss": 0.0333, "step": 697 }, { "epoch": 0.1990305104077559, "grad_norm": 1.0636884242784166, "learning_rate": 9.270844474594381e-06, "loss": 0.0165, "step": 698 }, { "epoch": 0.19931565440547477, "grad_norm": 1.0517832694626335, "learning_rate": 9.268440984830163e-06, "loss": 0.0186, "step": 699 }, { "epoch": 0.1996007984031936, "grad_norm": 1.0627875282801584, "learning_rate": 9.266033852933525e-06, "loss": 0.0299, "step": 700 }, { "epoch": 0.19988594240091245, "grad_norm": 0.7272225161607353, "learning_rate": 9.263623080958398e-06, "loss": 0.0062, "step": 701 }, { "epoch": 0.20017108639863132, "grad_norm": 1.847720921414192, "learning_rate": 9.26120867096182e-06, "loss": 0.0506, "step": 702 }, { "epoch": 0.20045623039635016, "grad_norm": 1.6823362190282132, "learning_rate": 9.258790625003939e-06, "loss": 0.0653, "step": 703 }, { "epoch": 0.200741374394069, "grad_norm": 1.4787235791879039, "learning_rate": 9.256368945147998e-06, "loss": 0.0538, "step": 704 }, { "epoch": 0.20102651839178784, "grad_norm": 1.2040529131598932, "learning_rate": 9.253943633460344e-06, "loss": 0.0175, "step": 705 }, { "epoch": 0.2013116623895067, "grad_norm": 1.6910714189347542, "learning_rate": 9.251514692010423e-06, "loss": 0.0349, "step": 706 }, { "epoch": 0.20159680638722555, "grad_norm": 1.472948498746407, "learning_rate": 9.249082122870779e-06, "loss": 0.0369, "step": 707 }, { "epoch": 0.2018819503849444, "grad_norm": 0.8762692780359592, "learning_rate": 9.246645928117047e-06, "loss": 0.0224, "step": 708 }, { "epoch": 0.20216709438266325, "grad_norm": 2.4349769570385162, "learning_rate": 9.24420610982796e-06, "loss": 0.0254, "step": 709 }, { "epoch": 0.2024522383803821, "grad_norm": 1.9112438219894465, "learning_rate": 9.241762670085343e-06, "loss": 0.0455, "step": 710 }, { "epoch": 0.20273738237810093, "grad_norm": 2.434206504065968, "learning_rate": 9.239315610974109e-06, "loss": 0.0428, "step": 711 }, { "epoch": 0.2030225263758198, "grad_norm": 1.6611178690442459, "learning_rate": 9.236864934582259e-06, "loss": 0.0311, "step": 712 }, { "epoch": 0.20330767037353864, "grad_norm": 2.7886821146690237, "learning_rate": 9.234410643000884e-06, "loss": 0.0471, "step": 713 }, { "epoch": 0.20359281437125748, "grad_norm": 1.7169627167940775, "learning_rate": 9.231952738324155e-06, "loss": 0.0302, "step": 714 }, { "epoch": 0.20387795836897632, "grad_norm": 2.5621764097108652, "learning_rate": 9.229491222649328e-06, "loss": 0.0485, "step": 715 }, { "epoch": 0.2041631023666952, "grad_norm": 1.17459682130259, "learning_rate": 9.227026098076742e-06, "loss": 0.0264, "step": 716 }, { "epoch": 0.20444824636441403, "grad_norm": 1.3071201080016008, "learning_rate": 9.224557366709813e-06, "loss": 0.0255, "step": 717 }, { "epoch": 0.20473339036213287, "grad_norm": 2.051560890960995, "learning_rate": 9.222085030655035e-06, "loss": 0.0476, "step": 718 }, { "epoch": 0.20501853435985173, "grad_norm": 1.019980133767295, "learning_rate": 9.219609092021976e-06, "loss": 0.0202, "step": 719 }, { "epoch": 0.20530367835757057, "grad_norm": 0.8053908676837997, "learning_rate": 9.217129552923287e-06, "loss": 0.0144, "step": 720 }, { "epoch": 0.2055888223552894, "grad_norm": 0.44915310802273345, "learning_rate": 9.214646415474676e-06, "loss": 0.0188, "step": 721 }, { "epoch": 0.20587396635300828, "grad_norm": 2.0651332669928664, "learning_rate": 9.212159681794935e-06, "loss": 0.0576, "step": 722 }, { "epoch": 0.20615911035072712, "grad_norm": 1.1845182088251505, "learning_rate": 9.209669354005915e-06, "loss": 0.0251, "step": 723 }, { "epoch": 0.20644425434844596, "grad_norm": 0.8224946140289474, "learning_rate": 9.20717543423254e-06, "loss": 0.0397, "step": 724 }, { "epoch": 0.2067293983461648, "grad_norm": 0.922555862579926, "learning_rate": 9.204677924602799e-06, "loss": 0.0228, "step": 725 }, { "epoch": 0.20701454234388367, "grad_norm": 0.7195816655824018, "learning_rate": 9.202176827247739e-06, "loss": 0.0234, "step": 726 }, { "epoch": 0.2072996863416025, "grad_norm": 1.5777936992378956, "learning_rate": 9.19967214430147e-06, "loss": 0.047, "step": 727 }, { "epoch": 0.20758483033932135, "grad_norm": 2.115593197700014, "learning_rate": 9.197163877901167e-06, "loss": 0.0804, "step": 728 }, { "epoch": 0.20786997433704021, "grad_norm": 1.7316346105054923, "learning_rate": 9.194652030187055e-06, "loss": 0.0452, "step": 729 }, { "epoch": 0.20815511833475905, "grad_norm": 1.732715865988259, "learning_rate": 9.19213660330242e-06, "loss": 0.0288, "step": 730 }, { "epoch": 0.2084402623324779, "grad_norm": 2.9257895919417085, "learning_rate": 9.1896175993936e-06, "loss": 0.0551, "step": 731 }, { "epoch": 0.20872540633019676, "grad_norm": 1.2005441423398024, "learning_rate": 9.187095020609982e-06, "loss": 0.0591, "step": 732 }, { "epoch": 0.2090105503279156, "grad_norm": 1.1442567758861109, "learning_rate": 9.18456886910401e-06, "loss": 0.0316, "step": 733 }, { "epoch": 0.20929569432563444, "grad_norm": 1.442435487183275, "learning_rate": 9.182039147031174e-06, "loss": 0.0357, "step": 734 }, { "epoch": 0.20958083832335328, "grad_norm": 0.7389338008585332, "learning_rate": 9.179505856550006e-06, "loss": 0.0213, "step": 735 }, { "epoch": 0.20986598232107215, "grad_norm": 0.5763570642711087, "learning_rate": 9.176968999822091e-06, "loss": 0.0261, "step": 736 }, { "epoch": 0.210151126318791, "grad_norm": 2.9913704549741467, "learning_rate": 9.174428579012051e-06, "loss": 0.0466, "step": 737 }, { "epoch": 0.21043627031650983, "grad_norm": 0.9514972565621266, "learning_rate": 9.171884596287548e-06, "loss": 0.0316, "step": 738 }, { "epoch": 0.2107214143142287, "grad_norm": 1.1166267794394404, "learning_rate": 9.16933705381929e-06, "loss": 0.0425, "step": 739 }, { "epoch": 0.21100655831194753, "grad_norm": 2.6475916702209794, "learning_rate": 9.166785953781017e-06, "loss": 0.0383, "step": 740 }, { "epoch": 0.21129170230966637, "grad_norm": 0.5035517781871393, "learning_rate": 9.164231298349505e-06, "loss": 0.0116, "step": 741 }, { "epoch": 0.21157684630738524, "grad_norm": 2.735386210598088, "learning_rate": 9.161673089704565e-06, "loss": 0.0439, "step": 742 }, { "epoch": 0.21186199030510408, "grad_norm": 1.7132023378847019, "learning_rate": 9.159111330029041e-06, "loss": 0.0428, "step": 743 }, { "epoch": 0.21214713430282292, "grad_norm": 1.2102951850569579, "learning_rate": 9.156546021508803e-06, "loss": 0.0181, "step": 744 }, { "epoch": 0.21243227830054176, "grad_norm": 1.822168441101455, "learning_rate": 9.153977166332756e-06, "loss": 0.038, "step": 745 }, { "epoch": 0.21271742229826063, "grad_norm": 3.1595776380649334, "learning_rate": 9.151404766692822e-06, "loss": 0.0676, "step": 746 }, { "epoch": 0.21300256629597947, "grad_norm": 2.4059297211150947, "learning_rate": 9.148828824783956e-06, "loss": 0.0418, "step": 747 }, { "epoch": 0.2132877102936983, "grad_norm": 1.1863131598693795, "learning_rate": 9.146249342804128e-06, "loss": 0.0179, "step": 748 }, { "epoch": 0.21357285429141717, "grad_norm": 1.6629250296893985, "learning_rate": 9.143666322954336e-06, "loss": 0.0321, "step": 749 }, { "epoch": 0.21385799828913601, "grad_norm": 0.702256216039878, "learning_rate": 9.141079767438592e-06, "loss": 0.0272, "step": 750 }, { "epoch": 0.21414314228685485, "grad_norm": 1.0421114336526116, "learning_rate": 9.138489678463927e-06, "loss": 0.0288, "step": 751 }, { "epoch": 0.21442828628457372, "grad_norm": 1.398071589217176, "learning_rate": 9.135896058240384e-06, "loss": 0.0164, "step": 752 }, { "epoch": 0.21471343028229256, "grad_norm": 1.180627226683535, "learning_rate": 9.133298908981021e-06, "loss": 0.0302, "step": 753 }, { "epoch": 0.2149985742800114, "grad_norm": 2.12849533813395, "learning_rate": 9.13069823290191e-06, "loss": 0.034, "step": 754 }, { "epoch": 0.21528371827773024, "grad_norm": 1.1520445091389948, "learning_rate": 9.12809403222213e-06, "loss": 0.02, "step": 755 }, { "epoch": 0.2155688622754491, "grad_norm": 0.9314697391415853, "learning_rate": 9.125486309163764e-06, "loss": 0.022, "step": 756 }, { "epoch": 0.21585400627316795, "grad_norm": 0.4807671860197455, "learning_rate": 9.122875065951907e-06, "loss": 0.0129, "step": 757 }, { "epoch": 0.2161391502708868, "grad_norm": 1.0284661378275857, "learning_rate": 9.12026030481465e-06, "loss": 0.0198, "step": 758 }, { "epoch": 0.21642429426860565, "grad_norm": 1.3711724506820158, "learning_rate": 9.117642027983096e-06, "loss": 0.0275, "step": 759 }, { "epoch": 0.2167094382663245, "grad_norm": 1.2056627878364008, "learning_rate": 9.115020237691336e-06, "loss": 0.0233, "step": 760 }, { "epoch": 0.21699458226404333, "grad_norm": 1.5615068937947674, "learning_rate": 9.11239493617647e-06, "loss": 0.0472, "step": 761 }, { "epoch": 0.2172797262617622, "grad_norm": 1.053111712057514, "learning_rate": 9.109766125678585e-06, "loss": 0.0236, "step": 762 }, { "epoch": 0.21756487025948104, "grad_norm": 1.492926709547356, "learning_rate": 9.107133808440767e-06, "loss": 0.0278, "step": 763 }, { "epoch": 0.21785001425719988, "grad_norm": 0.7321893976881936, "learning_rate": 9.104497986709096e-06, "loss": 0.011, "step": 764 }, { "epoch": 0.21813515825491872, "grad_norm": 0.972053791654552, "learning_rate": 9.101858662732635e-06, "loss": 0.0284, "step": 765 }, { "epoch": 0.2184203022526376, "grad_norm": 0.384726545541271, "learning_rate": 9.099215838763444e-06, "loss": 0.0091, "step": 766 }, { "epoch": 0.21870544625035643, "grad_norm": 1.4964596931892333, "learning_rate": 9.096569517056562e-06, "loss": 0.0155, "step": 767 }, { "epoch": 0.21899059024807527, "grad_norm": 1.4119719189435824, "learning_rate": 9.093919699870017e-06, "loss": 0.0278, "step": 768 }, { "epoch": 0.21927573424579413, "grad_norm": 2.196204389526829, "learning_rate": 9.091266389464818e-06, "loss": 0.0621, "step": 769 }, { "epoch": 0.21956087824351297, "grad_norm": 1.437174200010841, "learning_rate": 9.088609588104958e-06, "loss": 0.0207, "step": 770 }, { "epoch": 0.21984602224123181, "grad_norm": 2.1986672533499307, "learning_rate": 9.085949298057402e-06, "loss": 0.0357, "step": 771 }, { "epoch": 0.22013116623895068, "grad_norm": 2.0638899957028642, "learning_rate": 9.083285521592097e-06, "loss": 0.0473, "step": 772 }, { "epoch": 0.22041631023666952, "grad_norm": 1.1744345395272808, "learning_rate": 9.080618260981964e-06, "loss": 0.0155, "step": 773 }, { "epoch": 0.22070145423438836, "grad_norm": 2.306169669849833, "learning_rate": 9.077947518502894e-06, "loss": 0.033, "step": 774 }, { "epoch": 0.2209865982321072, "grad_norm": 1.9124745318350345, "learning_rate": 9.075273296433753e-06, "loss": 0.0379, "step": 775 }, { "epoch": 0.22127174222982607, "grad_norm": 2.0494348960481785, "learning_rate": 9.072595597056375e-06, "loss": 0.0671, "step": 776 }, { "epoch": 0.2215568862275449, "grad_norm": 1.6680009681455517, "learning_rate": 9.069914422655559e-06, "loss": 0.0301, "step": 777 }, { "epoch": 0.22184203022526375, "grad_norm": 1.6771017256370313, "learning_rate": 9.067229775519071e-06, "loss": 0.0229, "step": 778 }, { "epoch": 0.22212717422298262, "grad_norm": 2.0275590166360806, "learning_rate": 9.064541657937641e-06, "loss": 0.0685, "step": 779 }, { "epoch": 0.22241231822070145, "grad_norm": 2.360895367226193, "learning_rate": 9.061850072204958e-06, "loss": 0.0299, "step": 780 }, { "epoch": 0.2226974622184203, "grad_norm": 1.5539467978062291, "learning_rate": 9.05915502061767e-06, "loss": 0.0403, "step": 781 }, { "epoch": 0.22298260621613916, "grad_norm": 0.7670508938888166, "learning_rate": 9.056456505475385e-06, "loss": 0.0105, "step": 782 }, { "epoch": 0.223267750213858, "grad_norm": 0.7937880675570071, "learning_rate": 9.053754529080664e-06, "loss": 0.0176, "step": 783 }, { "epoch": 0.22355289421157684, "grad_norm": 2.157831660066521, "learning_rate": 9.051049093739023e-06, "loss": 0.0336, "step": 784 }, { "epoch": 0.22383803820929568, "grad_norm": 1.1663071318976364, "learning_rate": 9.048340201758929e-06, "loss": 0.0252, "step": 785 }, { "epoch": 0.22412318220701455, "grad_norm": 1.292838079011928, "learning_rate": 9.045627855451797e-06, "loss": 0.0363, "step": 786 }, { "epoch": 0.2244083262047334, "grad_norm": 1.5733550446218494, "learning_rate": 9.04291205713199e-06, "loss": 0.0213, "step": 787 }, { "epoch": 0.22469347020245223, "grad_norm": 1.1259643693921413, "learning_rate": 9.04019280911682e-06, "loss": 0.0149, "step": 788 }, { "epoch": 0.2249786142001711, "grad_norm": 1.0083139262164555, "learning_rate": 9.037470113726537e-06, "loss": 0.0172, "step": 789 }, { "epoch": 0.22526375819788994, "grad_norm": 1.3511474029563015, "learning_rate": 9.034743973284337e-06, "loss": 0.0215, "step": 790 }, { "epoch": 0.22554890219560877, "grad_norm": 0.8363495828122983, "learning_rate": 9.032014390116351e-06, "loss": 0.0266, "step": 791 }, { "epoch": 0.22583404619332764, "grad_norm": 3.1833632224433996, "learning_rate": 9.029281366551654e-06, "loss": 0.0254, "step": 792 }, { "epoch": 0.22611919019104648, "grad_norm": 2.1181522596055435, "learning_rate": 9.02654490492225e-06, "loss": 0.034, "step": 793 }, { "epoch": 0.22640433418876532, "grad_norm": 1.125558580379027, "learning_rate": 9.02380500756308e-06, "loss": 0.0244, "step": 794 }, { "epoch": 0.22668947818648416, "grad_norm": 1.2351475059799766, "learning_rate": 9.021061676812016e-06, "loss": 0.0175, "step": 795 }, { "epoch": 0.22697462218420303, "grad_norm": 2.0823417135384426, "learning_rate": 9.01831491500986e-06, "loss": 0.0314, "step": 796 }, { "epoch": 0.22725976618192187, "grad_norm": 2.6273804667622955, "learning_rate": 9.015564724500343e-06, "loss": 0.0439, "step": 797 }, { "epoch": 0.2275449101796407, "grad_norm": 0.5129484219330254, "learning_rate": 9.012811107630118e-06, "loss": 0.0101, "step": 798 }, { "epoch": 0.22783005417735958, "grad_norm": 1.1938003324425006, "learning_rate": 9.010054066748764e-06, "loss": 0.0379, "step": 799 }, { "epoch": 0.22811519817507842, "grad_norm": 3.097898875063007, "learning_rate": 9.00729360420878e-06, "loss": 0.0781, "step": 800 }, { "epoch": 0.22840034217279725, "grad_norm": 1.0145281059549673, "learning_rate": 9.004529722365585e-06, "loss": 0.0137, "step": 801 }, { "epoch": 0.22868548617051612, "grad_norm": 0.9369644953910446, "learning_rate": 9.001762423577521e-06, "loss": 0.0129, "step": 802 }, { "epoch": 0.22897063016823496, "grad_norm": 1.6897375760595272, "learning_rate": 8.998991710205837e-06, "loss": 0.0358, "step": 803 }, { "epoch": 0.2292557741659538, "grad_norm": 1.5083615821843614, "learning_rate": 8.996217584614702e-06, "loss": 0.0299, "step": 804 }, { "epoch": 0.22954091816367264, "grad_norm": 1.5739940630915592, "learning_rate": 8.99344004917119e-06, "loss": 0.0256, "step": 805 }, { "epoch": 0.2298260621613915, "grad_norm": 3.8075761182654593, "learning_rate": 8.990659106245292e-06, "loss": 0.0503, "step": 806 }, { "epoch": 0.23011120615911035, "grad_norm": 0.679718812603675, "learning_rate": 8.9878747582099e-06, "loss": 0.0152, "step": 807 }, { "epoch": 0.2303963501568292, "grad_norm": 1.6636057899743955, "learning_rate": 8.98508700744082e-06, "loss": 0.0236, "step": 808 }, { "epoch": 0.23068149415454806, "grad_norm": 1.0031959304118039, "learning_rate": 8.98229585631675e-06, "loss": 0.0223, "step": 809 }, { "epoch": 0.2309666381522669, "grad_norm": 0.49804281066136624, "learning_rate": 8.979501307219298e-06, "loss": 0.0126, "step": 810 }, { "epoch": 0.23125178214998574, "grad_norm": 2.390993893561002, "learning_rate": 8.976703362532971e-06, "loss": 0.0316, "step": 811 }, { "epoch": 0.2315369261477046, "grad_norm": 1.9859352958275294, "learning_rate": 8.973902024645165e-06, "loss": 0.0164, "step": 812 }, { "epoch": 0.23182207014542344, "grad_norm": 2.609134116114303, "learning_rate": 8.971097295946183e-06, "loss": 0.0511, "step": 813 }, { "epoch": 0.23210721414314228, "grad_norm": 1.3455604419784448, "learning_rate": 8.968289178829214e-06, "loss": 0.0232, "step": 814 }, { "epoch": 0.23239235814086112, "grad_norm": 2.843848539977979, "learning_rate": 8.96547767569034e-06, "loss": 0.0263, "step": 815 }, { "epoch": 0.23267750213858, "grad_norm": 3.7860602043709983, "learning_rate": 8.962662788928531e-06, "loss": 0.0428, "step": 816 }, { "epoch": 0.23296264613629883, "grad_norm": 2.4050408956018763, "learning_rate": 8.959844520945646e-06, "loss": 0.0357, "step": 817 }, { "epoch": 0.23324779013401767, "grad_norm": 2.7114207248581113, "learning_rate": 8.957022874146429e-06, "loss": 0.0348, "step": 818 }, { "epoch": 0.23353293413173654, "grad_norm": 2.5893678936272444, "learning_rate": 8.954197850938506e-06, "loss": 0.0257, "step": 819 }, { "epoch": 0.23381807812945538, "grad_norm": 1.1802173880782538, "learning_rate": 8.951369453732386e-06, "loss": 0.0354, "step": 820 }, { "epoch": 0.23410322212717422, "grad_norm": 0.5538609744161046, "learning_rate": 8.948537684941452e-06, "loss": 0.0121, "step": 821 }, { "epoch": 0.23438836612489308, "grad_norm": 0.7526259552569778, "learning_rate": 8.94570254698197e-06, "loss": 0.0348, "step": 822 }, { "epoch": 0.23467351012261192, "grad_norm": 0.38116984197805204, "learning_rate": 8.942864042273075e-06, "loss": 0.0036, "step": 823 }, { "epoch": 0.23495865412033076, "grad_norm": 1.31857014571736, "learning_rate": 8.94002217323678e-06, "loss": 0.0163, "step": 824 }, { "epoch": 0.2352437981180496, "grad_norm": 2.921327945172875, "learning_rate": 8.937176942297968e-06, "loss": 0.0399, "step": 825 }, { "epoch": 0.23552894211576847, "grad_norm": 1.5144558599662687, "learning_rate": 8.934328351884386e-06, "loss": 0.0382, "step": 826 }, { "epoch": 0.2358140861134873, "grad_norm": 1.2344815677516623, "learning_rate": 8.931476404426653e-06, "loss": 0.0332, "step": 827 }, { "epoch": 0.23609923011120615, "grad_norm": 0.7033882183737051, "learning_rate": 8.928621102358248e-06, "loss": 0.0118, "step": 828 }, { "epoch": 0.23638437410892502, "grad_norm": 6.45022989283852, "learning_rate": 8.925762448115516e-06, "loss": 0.0202, "step": 829 }, { "epoch": 0.23666951810664386, "grad_norm": 2.5229595774880087, "learning_rate": 8.92290044413766e-06, "loss": 0.0368, "step": 830 }, { "epoch": 0.2369546621043627, "grad_norm": 1.258824021540127, "learning_rate": 8.92003509286674e-06, "loss": 0.0202, "step": 831 }, { "epoch": 0.23723980610208156, "grad_norm": 2.6905988020090956, "learning_rate": 8.917166396747681e-06, "loss": 0.0397, "step": 832 }, { "epoch": 0.2375249500998004, "grad_norm": 1.4334425139307863, "learning_rate": 8.914294358228245e-06, "loss": 0.0233, "step": 833 }, { "epoch": 0.23781009409751924, "grad_norm": 3.3639351642610373, "learning_rate": 8.911418979759066e-06, "loss": 0.035, "step": 834 }, { "epoch": 0.23809523809523808, "grad_norm": 1.0725408705087607, "learning_rate": 8.908540263793611e-06, "loss": 0.0357, "step": 835 }, { "epoch": 0.23838038209295695, "grad_norm": 0.8357183521879652, "learning_rate": 8.905658212788207e-06, "loss": 0.0134, "step": 836 }, { "epoch": 0.2386655260906758, "grad_norm": 2.0411720213460347, "learning_rate": 8.902772829202015e-06, "loss": 0.0561, "step": 837 }, { "epoch": 0.23895067008839463, "grad_norm": 0.4962525894662451, "learning_rate": 8.899884115497053e-06, "loss": 0.0089, "step": 838 }, { "epoch": 0.2392358140861135, "grad_norm": 5.584077909965726, "learning_rate": 8.896992074138171e-06, "loss": 0.0478, "step": 839 }, { "epoch": 0.23952095808383234, "grad_norm": 1.8286855246809188, "learning_rate": 8.89409670759306e-06, "loss": 0.0356, "step": 840 }, { "epoch": 0.23980610208155118, "grad_norm": 0.7242311973923197, "learning_rate": 8.89119801833225e-06, "loss": 0.0196, "step": 841 }, { "epoch": 0.24009124607927004, "grad_norm": 1.6524840292900185, "learning_rate": 8.888296008829106e-06, "loss": 0.0313, "step": 842 }, { "epoch": 0.24037639007698888, "grad_norm": 1.6025270226398667, "learning_rate": 8.885390681559822e-06, "loss": 0.0589, "step": 843 }, { "epoch": 0.24066153407470772, "grad_norm": 0.6828303452986196, "learning_rate": 8.88248203900343e-06, "loss": 0.0129, "step": 844 }, { "epoch": 0.24094667807242656, "grad_norm": 1.0847428212810786, "learning_rate": 8.879570083641788e-06, "loss": 0.0169, "step": 845 }, { "epoch": 0.24123182207014543, "grad_norm": 1.3682289401821128, "learning_rate": 8.87665481795958e-06, "loss": 0.0253, "step": 846 }, { "epoch": 0.24151696606786427, "grad_norm": 3.7461358288629434, "learning_rate": 8.873736244444311e-06, "loss": 0.0483, "step": 847 }, { "epoch": 0.2418021100655831, "grad_norm": 1.9237644513844145, "learning_rate": 8.870814365586315e-06, "loss": 0.0205, "step": 848 }, { "epoch": 0.24208725406330198, "grad_norm": 1.2152994567992417, "learning_rate": 8.867889183878742e-06, "loss": 0.0268, "step": 849 }, { "epoch": 0.24237239806102082, "grad_norm": 1.91329866781686, "learning_rate": 8.864960701817564e-06, "loss": 0.0178, "step": 850 }, { "epoch": 0.24265754205873966, "grad_norm": 1.9431409917366391, "learning_rate": 8.862028921901563e-06, "loss": 0.033, "step": 851 }, { "epoch": 0.24294268605645852, "grad_norm": 1.2283113327719135, "learning_rate": 8.859093846632343e-06, "loss": 0.0295, "step": 852 }, { "epoch": 0.24322783005417736, "grad_norm": 0.9973988056561106, "learning_rate": 8.856155478514313e-06, "loss": 0.0212, "step": 853 }, { "epoch": 0.2435129740518962, "grad_norm": 2.4421795168152753, "learning_rate": 8.853213820054693e-06, "loss": 0.0172, "step": 854 }, { "epoch": 0.24379811804961504, "grad_norm": 0.6853186509231574, "learning_rate": 8.850268873763514e-06, "loss": 0.0071, "step": 855 }, { "epoch": 0.2440832620473339, "grad_norm": 0.20714634708932925, "learning_rate": 8.84732064215361e-06, "loss": 0.0081, "step": 856 }, { "epoch": 0.24436840604505275, "grad_norm": 1.4038906766225647, "learning_rate": 8.844369127740617e-06, "loss": 0.0591, "step": 857 }, { "epoch": 0.2446535500427716, "grad_norm": 3.2053223620330944, "learning_rate": 8.841414333042975e-06, "loss": 0.0932, "step": 858 }, { "epoch": 0.24493869404049046, "grad_norm": 0.7632677420756766, "learning_rate": 8.83845626058192e-06, "loss": 0.016, "step": 859 }, { "epoch": 0.2452238380382093, "grad_norm": 1.4585990037580068, "learning_rate": 8.835494912881487e-06, "loss": 0.0264, "step": 860 }, { "epoch": 0.24550898203592814, "grad_norm": 0.9392293367012389, "learning_rate": 8.832530292468509e-06, "loss": 0.0176, "step": 861 }, { "epoch": 0.245794126033647, "grad_norm": 0.9313249492245359, "learning_rate": 8.8295624018726e-06, "loss": 0.0239, "step": 862 }, { "epoch": 0.24607927003136584, "grad_norm": 1.8503666333746411, "learning_rate": 8.826591243626178e-06, "loss": 0.0468, "step": 863 }, { "epoch": 0.24636441402908468, "grad_norm": 1.5599510249156554, "learning_rate": 8.82361682026444e-06, "loss": 0.0466, "step": 864 }, { "epoch": 0.24664955802680352, "grad_norm": 0.7932008240844486, "learning_rate": 8.820639134325371e-06, "loss": 0.0174, "step": 865 }, { "epoch": 0.2469347020245224, "grad_norm": 1.7519462558310501, "learning_rate": 8.817658188349745e-06, "loss": 0.0308, "step": 866 }, { "epoch": 0.24721984602224123, "grad_norm": 0.7374567291630424, "learning_rate": 8.81467398488111e-06, "loss": 0.0268, "step": 867 }, { "epoch": 0.24750499001996007, "grad_norm": 2.2486150692979185, "learning_rate": 8.811686526465799e-06, "loss": 0.0364, "step": 868 }, { "epoch": 0.24779013401767894, "grad_norm": 1.743439371955327, "learning_rate": 8.808695815652922e-06, "loss": 0.0414, "step": 869 }, { "epoch": 0.24807527801539778, "grad_norm": 1.0726966014600394, "learning_rate": 8.805701854994358e-06, "loss": 0.0208, "step": 870 }, { "epoch": 0.24836042201311662, "grad_norm": 2.634942103615967, "learning_rate": 8.802704647044766e-06, "loss": 0.0335, "step": 871 }, { "epoch": 0.24864556601083548, "grad_norm": 1.2580046690380893, "learning_rate": 8.799704194361575e-06, "loss": 0.0253, "step": 872 }, { "epoch": 0.24893071000855432, "grad_norm": 1.096234047219748, "learning_rate": 8.79670049950498e-06, "loss": 0.0328, "step": 873 }, { "epoch": 0.24921585400627316, "grad_norm": 1.547027261492061, "learning_rate": 8.793693565037942e-06, "loss": 0.034, "step": 874 }, { "epoch": 0.249500998003992, "grad_norm": 0.5892064156881954, "learning_rate": 8.790683393526192e-06, "loss": 0.0164, "step": 875 }, { "epoch": 0.24978614200171087, "grad_norm": 1.3901316717407268, "learning_rate": 8.787669987538214e-06, "loss": 0.0405, "step": 876 }, { "epoch": 0.25007128599942974, "grad_norm": 1.7467544871345946, "learning_rate": 8.784653349645259e-06, "loss": 0.0464, "step": 877 }, { "epoch": 0.2503564299971486, "grad_norm": 0.8752141247160915, "learning_rate": 8.781633482421333e-06, "loss": 0.0186, "step": 878 }, { "epoch": 0.2506415739948674, "grad_norm": 2.4276853957481612, "learning_rate": 8.7786103884432e-06, "loss": 0.0584, "step": 879 }, { "epoch": 0.25092671799258626, "grad_norm": 1.3072797801662137, "learning_rate": 8.775584070290373e-06, "loss": 0.0283, "step": 880 }, { "epoch": 0.2512118619903051, "grad_norm": 0.676361478133077, "learning_rate": 8.772554530545118e-06, "loss": 0.0127, "step": 881 }, { "epoch": 0.25149700598802394, "grad_norm": 2.1577548832377618, "learning_rate": 8.769521771792453e-06, "loss": 0.0389, "step": 882 }, { "epoch": 0.2517821499857428, "grad_norm": 2.3856943630410496, "learning_rate": 8.766485796620135e-06, "loss": 0.0634, "step": 883 }, { "epoch": 0.25206729398346167, "grad_norm": 1.0305070102089764, "learning_rate": 8.763446607618675e-06, "loss": 0.0202, "step": 884 }, { "epoch": 0.2523524379811805, "grad_norm": 1.2830717231149895, "learning_rate": 8.76040420738132e-06, "loss": 0.0284, "step": 885 }, { "epoch": 0.25263758197889935, "grad_norm": 1.6411073612068923, "learning_rate": 8.75735859850406e-06, "loss": 0.0219, "step": 886 }, { "epoch": 0.2529227259766182, "grad_norm": 1.1756235902210506, "learning_rate": 8.754309783585619e-06, "loss": 0.0349, "step": 887 }, { "epoch": 0.25320786997433703, "grad_norm": 0.6847813278483827, "learning_rate": 8.751257765227462e-06, "loss": 0.0206, "step": 888 }, { "epoch": 0.25349301397205587, "grad_norm": 0.5892100476513646, "learning_rate": 8.748202546033781e-06, "loss": 0.0202, "step": 889 }, { "epoch": 0.2537781579697747, "grad_norm": 0.6949316617452543, "learning_rate": 8.745144128611506e-06, "loss": 0.021, "step": 890 }, { "epoch": 0.2540633019674936, "grad_norm": 0.9729222569070698, "learning_rate": 8.742082515570291e-06, "loss": 0.0194, "step": 891 }, { "epoch": 0.25434844596521244, "grad_norm": 0.45568034642641875, "learning_rate": 8.739017709522519e-06, "loss": 0.0106, "step": 892 }, { "epoch": 0.2546335899629313, "grad_norm": 1.2410486423220612, "learning_rate": 8.735949713083295e-06, "loss": 0.0118, "step": 893 }, { "epoch": 0.2549187339606501, "grad_norm": 1.9597718656925054, "learning_rate": 8.732878528870452e-06, "loss": 0.0271, "step": 894 }, { "epoch": 0.25520387795836896, "grad_norm": 1.7196417456122466, "learning_rate": 8.729804159504537e-06, "loss": 0.0498, "step": 895 }, { "epoch": 0.2554890219560878, "grad_norm": 0.9101465781142756, "learning_rate": 8.726726607608817e-06, "loss": 0.009, "step": 896 }, { "epoch": 0.2557741659538067, "grad_norm": 1.5268690830009926, "learning_rate": 8.723645875809274e-06, "loss": 0.0311, "step": 897 }, { "epoch": 0.25605930995152554, "grad_norm": 1.2252517246259709, "learning_rate": 8.720561966734604e-06, "loss": 0.0102, "step": 898 }, { "epoch": 0.2563444539492444, "grad_norm": 1.2280625351519197, "learning_rate": 8.717474883016214e-06, "loss": 0.0085, "step": 899 }, { "epoch": 0.2566295979469632, "grad_norm": 3.117132439752605, "learning_rate": 8.714384627288222e-06, "loss": 0.0791, "step": 900 }, { "epoch": 0.25691474194468206, "grad_norm": 1.4949161090600118, "learning_rate": 8.711291202187447e-06, "loss": 0.0322, "step": 901 }, { "epoch": 0.2571998859424009, "grad_norm": 1.8302629323751376, "learning_rate": 8.708194610353418e-06, "loss": 0.0617, "step": 902 }, { "epoch": 0.25748502994011974, "grad_norm": 2.308044393671751, "learning_rate": 8.705094854428362e-06, "loss": 0.0443, "step": 903 }, { "epoch": 0.25777017393783863, "grad_norm": 1.6208323890738643, "learning_rate": 8.701991937057211e-06, "loss": 0.0229, "step": 904 }, { "epoch": 0.25805531793555747, "grad_norm": 0.9063595773842802, "learning_rate": 8.698885860887587e-06, "loss": 0.0076, "step": 905 }, { "epoch": 0.2583404619332763, "grad_norm": 0.7271393808586717, "learning_rate": 8.695776628569813e-06, "loss": 0.0305, "step": 906 }, { "epoch": 0.25862560593099515, "grad_norm": 1.9625970380708422, "learning_rate": 8.692664242756902e-06, "loss": 0.0415, "step": 907 }, { "epoch": 0.258910749928714, "grad_norm": 0.5666328525418538, "learning_rate": 8.689548706104564e-06, "loss": 0.0159, "step": 908 }, { "epoch": 0.25919589392643283, "grad_norm": 2.8145375900053806, "learning_rate": 8.68643002127119e-06, "loss": 0.0574, "step": 909 }, { "epoch": 0.25948103792415167, "grad_norm": 1.5275888603420062, "learning_rate": 8.683308190917857e-06, "loss": 0.0341, "step": 910 }, { "epoch": 0.25976618192187056, "grad_norm": 3.1426880240693214, "learning_rate": 8.680183217708334e-06, "loss": 0.0549, "step": 911 }, { "epoch": 0.2600513259195894, "grad_norm": 0.7773518110968769, "learning_rate": 8.677055104309062e-06, "loss": 0.0145, "step": 912 }, { "epoch": 0.26033646991730824, "grad_norm": 1.9823459424727548, "learning_rate": 8.673923853389172e-06, "loss": 0.0437, "step": 913 }, { "epoch": 0.2606216139150271, "grad_norm": 1.1164470316091455, "learning_rate": 8.670789467620461e-06, "loss": 0.0231, "step": 914 }, { "epoch": 0.2609067579127459, "grad_norm": 0.6832591626715077, "learning_rate": 8.667651949677409e-06, "loss": 0.0163, "step": 915 }, { "epoch": 0.26119190191046476, "grad_norm": 1.876547933242045, "learning_rate": 8.664511302237164e-06, "loss": 0.0279, "step": 916 }, { "epoch": 0.26147704590818366, "grad_norm": 1.152597026479379, "learning_rate": 8.661367527979547e-06, "loss": 0.0161, "step": 917 }, { "epoch": 0.2617621899059025, "grad_norm": 1.9164628134925323, "learning_rate": 8.658220629587046e-06, "loss": 0.0457, "step": 918 }, { "epoch": 0.26204733390362134, "grad_norm": 1.5725610393465852, "learning_rate": 8.655070609744816e-06, "loss": 0.0271, "step": 919 }, { "epoch": 0.2623324779013402, "grad_norm": 1.6679600131417485, "learning_rate": 8.651917471140673e-06, "loss": 0.0562, "step": 920 }, { "epoch": 0.262617621899059, "grad_norm": 1.7758731689450964, "learning_rate": 8.648761216465096e-06, "loss": 0.0299, "step": 921 }, { "epoch": 0.26290276589677786, "grad_norm": 1.3868053732760177, "learning_rate": 8.64560184841122e-06, "loss": 0.0178, "step": 922 }, { "epoch": 0.2631879098944967, "grad_norm": 1.9904157646953466, "learning_rate": 8.642439369674845e-06, "loss": 0.0239, "step": 923 }, { "epoch": 0.2634730538922156, "grad_norm": 1.7623169430202168, "learning_rate": 8.639273782954412e-06, "loss": 0.0263, "step": 924 }, { "epoch": 0.26375819788993443, "grad_norm": 1.0216670315613425, "learning_rate": 8.636105090951022e-06, "loss": 0.0286, "step": 925 }, { "epoch": 0.26404334188765327, "grad_norm": 1.0887871495489123, "learning_rate": 8.63293329636843e-06, "loss": 0.0395, "step": 926 }, { "epoch": 0.2643284858853721, "grad_norm": 1.0890011420632484, "learning_rate": 8.629758401913027e-06, "loss": 0.0269, "step": 927 }, { "epoch": 0.26461362988309095, "grad_norm": 1.0417579612272687, "learning_rate": 8.626580410293859e-06, "loss": 0.0133, "step": 928 }, { "epoch": 0.2648987738808098, "grad_norm": 1.236301382418515, "learning_rate": 8.623399324222608e-06, "loss": 0.0164, "step": 929 }, { "epoch": 0.26518391787852863, "grad_norm": 1.4547427099574102, "learning_rate": 8.620215146413603e-06, "loss": 0.0193, "step": 930 }, { "epoch": 0.2654690618762475, "grad_norm": 0.9108297216575995, "learning_rate": 8.617027879583801e-06, "loss": 0.0211, "step": 931 }, { "epoch": 0.26575420587396636, "grad_norm": 1.0697944180054113, "learning_rate": 8.613837526452806e-06, "loss": 0.0295, "step": 932 }, { "epoch": 0.2660393498716852, "grad_norm": 0.6751528441766895, "learning_rate": 8.61064408974285e-06, "loss": 0.0156, "step": 933 }, { "epoch": 0.26632449386940404, "grad_norm": 1.8490910731349444, "learning_rate": 8.607447572178796e-06, "loss": 0.0642, "step": 934 }, { "epoch": 0.2666096378671229, "grad_norm": 0.7855324683250933, "learning_rate": 8.604247976488137e-06, "loss": 0.0292, "step": 935 }, { "epoch": 0.2668947818648417, "grad_norm": 1.731820383728147, "learning_rate": 8.601045305400988e-06, "loss": 0.0373, "step": 936 }, { "epoch": 0.2671799258625606, "grad_norm": 2.1895639341457716, "learning_rate": 8.597839561650096e-06, "loss": 0.0577, "step": 937 }, { "epoch": 0.26746506986027946, "grad_norm": 0.891986333285142, "learning_rate": 8.594630747970824e-06, "loss": 0.0315, "step": 938 }, { "epoch": 0.2677502138579983, "grad_norm": 0.6734024951645515, "learning_rate": 8.591418867101158e-06, "loss": 0.0169, "step": 939 }, { "epoch": 0.26803535785571714, "grad_norm": 0.7790384279219437, "learning_rate": 8.588203921781699e-06, "loss": 0.024, "step": 940 }, { "epoch": 0.268320501853436, "grad_norm": 1.8442659646304564, "learning_rate": 8.584985914755663e-06, "loss": 0.0476, "step": 941 }, { "epoch": 0.2686056458511548, "grad_norm": 1.1596418180170927, "learning_rate": 8.581764848768878e-06, "loss": 0.0444, "step": 942 }, { "epoch": 0.26889078984887366, "grad_norm": 1.4812227170866508, "learning_rate": 8.578540726569782e-06, "loss": 0.0379, "step": 943 }, { "epoch": 0.26917593384659255, "grad_norm": 0.778765028574822, "learning_rate": 8.575313550909424e-06, "loss": 0.017, "step": 944 }, { "epoch": 0.2694610778443114, "grad_norm": 0.6214418267264455, "learning_rate": 8.572083324541454e-06, "loss": 0.0193, "step": 945 }, { "epoch": 0.26974622184203023, "grad_norm": 0.5537904546033956, "learning_rate": 8.568850050222129e-06, "loss": 0.0156, "step": 946 }, { "epoch": 0.27003136583974907, "grad_norm": 1.9665330817864057, "learning_rate": 8.565613730710303e-06, "loss": 0.0363, "step": 947 }, { "epoch": 0.2703165098374679, "grad_norm": 1.7179405656998434, "learning_rate": 8.56237436876743e-06, "loss": 0.0282, "step": 948 }, { "epoch": 0.27060165383518675, "grad_norm": 1.591795064619829, "learning_rate": 8.559131967157561e-06, "loss": 0.0381, "step": 949 }, { "epoch": 0.2708867978329056, "grad_norm": 1.4855162991515705, "learning_rate": 8.55588652864734e-06, "loss": 0.0319, "step": 950 }, { "epoch": 0.2711719418306245, "grad_norm": 1.23660525088581, "learning_rate": 8.552638056006004e-06, "loss": 0.0211, "step": 951 }, { "epoch": 0.2714570858283433, "grad_norm": 1.3975270889745315, "learning_rate": 8.549386552005375e-06, "loss": 0.0134, "step": 952 }, { "epoch": 0.27174222982606216, "grad_norm": 1.9571395698904834, "learning_rate": 8.546132019419862e-06, "loss": 0.0248, "step": 953 }, { "epoch": 0.272027373823781, "grad_norm": 2.1677184607215527, "learning_rate": 8.542874461026462e-06, "loss": 0.0502, "step": 954 }, { "epoch": 0.27231251782149984, "grad_norm": 1.699300392038293, "learning_rate": 8.539613879604751e-06, "loss": 0.049, "step": 955 }, { "epoch": 0.2725976618192187, "grad_norm": 1.5110559904082717, "learning_rate": 8.536350277936887e-06, "loss": 0.0361, "step": 956 }, { "epoch": 0.2728828058169376, "grad_norm": 1.694634659292682, "learning_rate": 8.533083658807601e-06, "loss": 0.0602, "step": 957 }, { "epoch": 0.2731679498146564, "grad_norm": 0.6948943012372113, "learning_rate": 8.529814025004202e-06, "loss": 0.0208, "step": 958 }, { "epoch": 0.27345309381237526, "grad_norm": 1.2290435954640724, "learning_rate": 8.526541379316569e-06, "loss": 0.037, "step": 959 }, { "epoch": 0.2737382378100941, "grad_norm": 0.9917533429856523, "learning_rate": 8.523265724537153e-06, "loss": 0.0326, "step": 960 }, { "epoch": 0.27402338180781294, "grad_norm": 0.5603987383374495, "learning_rate": 8.519987063460973e-06, "loss": 0.0135, "step": 961 }, { "epoch": 0.2743085258055318, "grad_norm": 0.7544090611895435, "learning_rate": 8.51670539888561e-06, "loss": 0.0346, "step": 962 }, { "epoch": 0.2745936698032506, "grad_norm": 1.2787254933712948, "learning_rate": 8.513420733611212e-06, "loss": 0.0456, "step": 963 }, { "epoch": 0.2748788138009695, "grad_norm": 1.186340995532642, "learning_rate": 8.510133070440483e-06, "loss": 0.023, "step": 964 }, { "epoch": 0.27516395779868835, "grad_norm": 1.1520864734843645, "learning_rate": 8.506842412178688e-06, "loss": 0.0204, "step": 965 }, { "epoch": 0.2754491017964072, "grad_norm": 1.3801482600792678, "learning_rate": 8.503548761633646e-06, "loss": 0.0513, "step": 966 }, { "epoch": 0.27573424579412603, "grad_norm": 1.4900548822914077, "learning_rate": 8.500252121615733e-06, "loss": 0.0357, "step": 967 }, { "epoch": 0.27601938979184487, "grad_norm": 1.3136520745656428, "learning_rate": 8.496952494937869e-06, "loss": 0.0287, "step": 968 }, { "epoch": 0.2763045337895637, "grad_norm": 1.1985944568873772, "learning_rate": 8.493649884415529e-06, "loss": 0.0288, "step": 969 }, { "epoch": 0.27658967778728255, "grad_norm": 1.3680126216074568, "learning_rate": 8.490344292866728e-06, "loss": 0.042, "step": 970 }, { "epoch": 0.27687482178500145, "grad_norm": 1.3159870085955017, "learning_rate": 8.487035723112033e-06, "loss": 0.0205, "step": 971 }, { "epoch": 0.2771599657827203, "grad_norm": 1.0078868077879315, "learning_rate": 8.483724177974543e-06, "loss": 0.0172, "step": 972 }, { "epoch": 0.2774451097804391, "grad_norm": 1.1647342396721414, "learning_rate": 8.480409660279903e-06, "loss": 0.0187, "step": 973 }, { "epoch": 0.27773025377815796, "grad_norm": 2.31581097184704, "learning_rate": 8.477092172856287e-06, "loss": 0.0386, "step": 974 }, { "epoch": 0.2780153977758768, "grad_norm": 2.810835578979634, "learning_rate": 8.473771718534411e-06, "loss": 0.0757, "step": 975 }, { "epoch": 0.27830054177359564, "grad_norm": 2.79707883978544, "learning_rate": 8.47044830014752e-06, "loss": 0.0477, "step": 976 }, { "epoch": 0.27858568577131454, "grad_norm": 0.9667581419870983, "learning_rate": 8.467121920531383e-06, "loss": 0.0242, "step": 977 }, { "epoch": 0.2788708297690334, "grad_norm": 0.6681742594213469, "learning_rate": 8.463792582524302e-06, "loss": 0.0096, "step": 978 }, { "epoch": 0.2791559737667522, "grad_norm": 1.4545154240977922, "learning_rate": 8.460460288967101e-06, "loss": 0.0217, "step": 979 }, { "epoch": 0.27944111776447106, "grad_norm": 0.3637082821635198, "learning_rate": 8.457125042703124e-06, "loss": 0.0067, "step": 980 }, { "epoch": 0.2797262617621899, "grad_norm": 1.7754271211354633, "learning_rate": 8.45378684657824e-06, "loss": 0.0622, "step": 981 }, { "epoch": 0.28001140575990874, "grad_norm": 0.512206035741963, "learning_rate": 8.45044570344083e-06, "loss": 0.0079, "step": 982 }, { "epoch": 0.2802965497576276, "grad_norm": 0.287836877286698, "learning_rate": 8.44710161614179e-06, "loss": 0.0067, "step": 983 }, { "epoch": 0.2805816937553465, "grad_norm": 0.5981293412787716, "learning_rate": 8.443754587534529e-06, "loss": 0.0182, "step": 984 }, { "epoch": 0.2808668377530653, "grad_norm": 1.5761429513855125, "learning_rate": 8.440404620474967e-06, "loss": 0.0237, "step": 985 }, { "epoch": 0.28115198175078415, "grad_norm": 1.3666585287674748, "learning_rate": 8.43705171782153e-06, "loss": 0.0425, "step": 986 }, { "epoch": 0.281437125748503, "grad_norm": 1.697391675178744, "learning_rate": 8.43369588243515e-06, "loss": 0.0236, "step": 987 }, { "epoch": 0.28172226974622183, "grad_norm": 3.0482492895545796, "learning_rate": 8.430337117179259e-06, "loss": 0.0409, "step": 988 }, { "epoch": 0.28200741374394067, "grad_norm": 0.7229976500220648, "learning_rate": 8.426975424919791e-06, "loss": 0.01, "step": 989 }, { "epoch": 0.2822925577416595, "grad_norm": 1.4158032540007655, "learning_rate": 8.423610808525177e-06, "loss": 0.0186, "step": 990 }, { "epoch": 0.2825777017393784, "grad_norm": 1.6615373983383692, "learning_rate": 8.420243270866343e-06, "loss": 0.0652, "step": 991 }, { "epoch": 0.28286284573709725, "grad_norm": 2.307370170129627, "learning_rate": 8.416872814816707e-06, "loss": 0.047, "step": 992 }, { "epoch": 0.2831479897348161, "grad_norm": 1.8461913330119348, "learning_rate": 8.41349944325218e-06, "loss": 0.0465, "step": 993 }, { "epoch": 0.2834331337325349, "grad_norm": 1.8568994890741377, "learning_rate": 8.410123159051155e-06, "loss": 0.0205, "step": 994 }, { "epoch": 0.28371827773025377, "grad_norm": 1.2212193684268584, "learning_rate": 8.40674396509452e-06, "loss": 0.0218, "step": 995 }, { "epoch": 0.2840034217279726, "grad_norm": 1.1766960236291593, "learning_rate": 8.40336186426563e-06, "loss": 0.0457, "step": 996 }, { "epoch": 0.2842885657256915, "grad_norm": 1.1251722175740966, "learning_rate": 8.39997685945034e-06, "loss": 0.0216, "step": 997 }, { "epoch": 0.28457370972341034, "grad_norm": 2.3908402551223786, "learning_rate": 8.396588953536968e-06, "loss": 0.0403, "step": 998 }, { "epoch": 0.2848588537211292, "grad_norm": 2.331842901417062, "learning_rate": 8.393198149416311e-06, "loss": 0.0701, "step": 999 }, { "epoch": 0.285143997718848, "grad_norm": 1.9563319254857874, "learning_rate": 8.389804449981645e-06, "loss": 0.0171, "step": 1000 }, { "epoch": 0.28542914171656686, "grad_norm": 1.5049554471669486, "learning_rate": 8.386407858128707e-06, "loss": 0.0368, "step": 1001 }, { "epoch": 0.2857142857142857, "grad_norm": 2.3620183376762647, "learning_rate": 8.383008376755707e-06, "loss": 0.0298, "step": 1002 }, { "epoch": 0.28599942971200454, "grad_norm": 1.5127708826099804, "learning_rate": 8.379606008763325e-06, "loss": 0.0384, "step": 1003 }, { "epoch": 0.28628457370972343, "grad_norm": 1.9077680147517553, "learning_rate": 8.376200757054695e-06, "loss": 0.0313, "step": 1004 }, { "epoch": 0.2865697177074423, "grad_norm": 1.3689906306129866, "learning_rate": 8.372792624535417e-06, "loss": 0.0321, "step": 1005 }, { "epoch": 0.2868548617051611, "grad_norm": 1.1238958908515264, "learning_rate": 8.369381614113547e-06, "loss": 0.014, "step": 1006 }, { "epoch": 0.28714000570287995, "grad_norm": 1.2072815121718892, "learning_rate": 8.365967728699602e-06, "loss": 0.0179, "step": 1007 }, { "epoch": 0.2874251497005988, "grad_norm": 0.4667146551541128, "learning_rate": 8.362550971206543e-06, "loss": 0.0146, "step": 1008 }, { "epoch": 0.28771029369831763, "grad_norm": 1.6836956764062203, "learning_rate": 8.359131344549788e-06, "loss": 0.0299, "step": 1009 }, { "epoch": 0.28799543769603647, "grad_norm": 0.8142984273923918, "learning_rate": 8.355708851647202e-06, "loss": 0.0226, "step": 1010 }, { "epoch": 0.28828058169375537, "grad_norm": 1.6389536841060004, "learning_rate": 8.352283495419096e-06, "loss": 0.0308, "step": 1011 }, { "epoch": 0.2885657256914742, "grad_norm": 1.5444478808693596, "learning_rate": 8.348855278788224e-06, "loss": 0.0589, "step": 1012 }, { "epoch": 0.28885086968919305, "grad_norm": 0.7259294284234575, "learning_rate": 8.345424204679778e-06, "loss": 0.0191, "step": 1013 }, { "epoch": 0.2891360136869119, "grad_norm": 0.6088052137967528, "learning_rate": 8.34199027602139e-06, "loss": 0.0158, "step": 1014 }, { "epoch": 0.2894211576846307, "grad_norm": 0.6287296236873937, "learning_rate": 8.338553495743132e-06, "loss": 0.0218, "step": 1015 }, { "epoch": 0.28970630168234957, "grad_norm": 1.7658297184074514, "learning_rate": 8.335113866777502e-06, "loss": 0.0441, "step": 1016 }, { "epoch": 0.28999144568006846, "grad_norm": 0.5235221980156642, "learning_rate": 8.331671392059433e-06, "loss": 0.0166, "step": 1017 }, { "epoch": 0.2902765896777873, "grad_norm": 0.8033872378713534, "learning_rate": 8.328226074526284e-06, "loss": 0.0292, "step": 1018 }, { "epoch": 0.29056173367550614, "grad_norm": 1.568699813214378, "learning_rate": 8.324777917117843e-06, "loss": 0.029, "step": 1019 }, { "epoch": 0.290846877673225, "grad_norm": 0.8052329509177221, "learning_rate": 8.321326922776321e-06, "loss": 0.0231, "step": 1020 }, { "epoch": 0.2911320216709438, "grad_norm": 0.5269396898380581, "learning_rate": 8.31787309444634e-06, "loss": 0.0174, "step": 1021 }, { "epoch": 0.29141716566866266, "grad_norm": 1.6205080390020445, "learning_rate": 8.314416435074956e-06, "loss": 0.0197, "step": 1022 }, { "epoch": 0.2917023096663815, "grad_norm": 1.666001562463425, "learning_rate": 8.31095694761163e-06, "loss": 0.019, "step": 1023 }, { "epoch": 0.2919874536641004, "grad_norm": 1.835698903031861, "learning_rate": 8.307494635008237e-06, "loss": 0.0214, "step": 1024 }, { "epoch": 0.29227259766181923, "grad_norm": 0.8037987767198944, "learning_rate": 8.304029500219064e-06, "loss": 0.0168, "step": 1025 }, { "epoch": 0.2925577416595381, "grad_norm": 2.370065976232439, "learning_rate": 8.300561546200812e-06, "loss": 0.0247, "step": 1026 }, { "epoch": 0.2928428856572569, "grad_norm": 1.5504716392290512, "learning_rate": 8.297090775912574e-06, "loss": 0.0391, "step": 1027 }, { "epoch": 0.29312802965497575, "grad_norm": 1.2554327251719692, "learning_rate": 8.293617192315859e-06, "loss": 0.0517, "step": 1028 }, { "epoch": 0.2934131736526946, "grad_norm": 0.790459152689314, "learning_rate": 8.29014079837457e-06, "loss": 0.0119, "step": 1029 }, { "epoch": 0.29369831765041343, "grad_norm": 1.4149648173860672, "learning_rate": 8.28666159705501e-06, "loss": 0.0133, "step": 1030 }, { "epoch": 0.2939834616481323, "grad_norm": 0.9973359366143715, "learning_rate": 8.283179591325879e-06, "loss": 0.013, "step": 1031 }, { "epoch": 0.29426860564585117, "grad_norm": 0.36926897766983774, "learning_rate": 8.279694784158262e-06, "loss": 0.008, "step": 1032 }, { "epoch": 0.29455374964357, "grad_norm": 0.9623976721838673, "learning_rate": 8.276207178525646e-06, "loss": 0.0196, "step": 1033 }, { "epoch": 0.29483889364128885, "grad_norm": 1.4554871378304686, "learning_rate": 8.272716777403898e-06, "loss": 0.041, "step": 1034 }, { "epoch": 0.2951240376390077, "grad_norm": 2.6162190558641463, "learning_rate": 8.26922358377127e-06, "loss": 0.0745, "step": 1035 }, { "epoch": 0.2954091816367265, "grad_norm": 0.8422156448296566, "learning_rate": 8.265727600608401e-06, "loss": 0.0194, "step": 1036 }, { "epoch": 0.2956943256344454, "grad_norm": 1.0120407410840147, "learning_rate": 8.262228830898313e-06, "loss": 0.0387, "step": 1037 }, { "epoch": 0.29597946963216426, "grad_norm": 0.7723311949173021, "learning_rate": 8.258727277626394e-06, "loss": 0.0102, "step": 1038 }, { "epoch": 0.2962646136298831, "grad_norm": 0.9132982937599546, "learning_rate": 8.255222943780419e-06, "loss": 0.0236, "step": 1039 }, { "epoch": 0.29654975762760194, "grad_norm": 1.4697397386034843, "learning_rate": 8.251715832350526e-06, "loss": 0.0406, "step": 1040 }, { "epoch": 0.2968349016253208, "grad_norm": 1.3125795625460037, "learning_rate": 8.248205946329233e-06, "loss": 0.0249, "step": 1041 }, { "epoch": 0.2971200456230396, "grad_norm": 2.0022314201473765, "learning_rate": 8.244693288711416e-06, "loss": 0.0389, "step": 1042 }, { "epoch": 0.29740518962075846, "grad_norm": 0.9233574909374214, "learning_rate": 8.241177862494323e-06, "loss": 0.0177, "step": 1043 }, { "epoch": 0.29769033361847735, "grad_norm": 1.118407928249881, "learning_rate": 8.23765967067756e-06, "loss": 0.0504, "step": 1044 }, { "epoch": 0.2979754776161962, "grad_norm": 2.874005101086032, "learning_rate": 8.234138716263095e-06, "loss": 0.0629, "step": 1045 }, { "epoch": 0.29826062161391503, "grad_norm": 1.7249213253113764, "learning_rate": 8.230615002255254e-06, "loss": 0.039, "step": 1046 }, { "epoch": 0.2985457656116339, "grad_norm": 1.282268931205406, "learning_rate": 8.227088531660712e-06, "loss": 0.0313, "step": 1047 }, { "epoch": 0.2988309096093527, "grad_norm": 1.5416921550475424, "learning_rate": 8.223559307488506e-06, "loss": 0.0517, "step": 1048 }, { "epoch": 0.29911605360707155, "grad_norm": 0.9912931776780044, "learning_rate": 8.220027332750012e-06, "loss": 0.0316, "step": 1049 }, { "epoch": 0.2994011976047904, "grad_norm": 0.6255964251801459, "learning_rate": 8.21649261045896e-06, "loss": 0.0218, "step": 1050 }, { "epoch": 0.2996863416025093, "grad_norm": 1.0299927686497676, "learning_rate": 8.212955143631425e-06, "loss": 0.0246, "step": 1051 }, { "epoch": 0.2999714856002281, "grad_norm": 1.5809553916222134, "learning_rate": 8.209414935285816e-06, "loss": 0.036, "step": 1052 }, { "epoch": 0.30025662959794697, "grad_norm": 1.0729865806566057, "learning_rate": 8.20587198844289e-06, "loss": 0.0227, "step": 1053 }, { "epoch": 0.3005417735956658, "grad_norm": 2.1564117236579454, "learning_rate": 8.202326306125736e-06, "loss": 0.0314, "step": 1054 }, { "epoch": 0.30082691759338465, "grad_norm": 0.9502094736160007, "learning_rate": 8.198777891359778e-06, "loss": 0.0248, "step": 1055 }, { "epoch": 0.3011120615911035, "grad_norm": 0.9962396374644391, "learning_rate": 8.19522674717277e-06, "loss": 0.041, "step": 1056 }, { "epoch": 0.3013972055888224, "grad_norm": 0.6369816183985646, "learning_rate": 8.1916728765948e-06, "loss": 0.0237, "step": 1057 }, { "epoch": 0.3016823495865412, "grad_norm": 2.5117579204688623, "learning_rate": 8.188116282658278e-06, "loss": 0.0359, "step": 1058 }, { "epoch": 0.30196749358426006, "grad_norm": 1.6764540415557843, "learning_rate": 8.184556968397938e-06, "loss": 0.0259, "step": 1059 }, { "epoch": 0.3022526375819789, "grad_norm": 0.9996801308305837, "learning_rate": 8.180994936850834e-06, "loss": 0.0242, "step": 1060 }, { "epoch": 0.30253778157969774, "grad_norm": 0.7595100913126963, "learning_rate": 8.177430191056346e-06, "loss": 0.0173, "step": 1061 }, { "epoch": 0.3028229255774166, "grad_norm": 0.9654704679405917, "learning_rate": 8.173862734056158e-06, "loss": 0.0277, "step": 1062 }, { "epoch": 0.3031080695751354, "grad_norm": 1.5038824626171603, "learning_rate": 8.170292568894278e-06, "loss": 0.0237, "step": 1063 }, { "epoch": 0.3033932135728543, "grad_norm": 0.7973443754741695, "learning_rate": 8.16671969861702e-06, "loss": 0.027, "step": 1064 }, { "epoch": 0.30367835757057315, "grad_norm": 1.8187366967593017, "learning_rate": 8.163144126273004e-06, "loss": 0.025, "step": 1065 }, { "epoch": 0.303963501568292, "grad_norm": 0.9250825610118477, "learning_rate": 8.159565854913162e-06, "loss": 0.0175, "step": 1066 }, { "epoch": 0.30424864556601083, "grad_norm": 1.506560630883658, "learning_rate": 8.155984887590724e-06, "loss": 0.0332, "step": 1067 }, { "epoch": 0.3045337895637297, "grad_norm": 1.2512981744236478, "learning_rate": 8.152401227361224e-06, "loss": 0.0369, "step": 1068 }, { "epoch": 0.3048189335614485, "grad_norm": 1.8833596709136027, "learning_rate": 8.148814877282487e-06, "loss": 0.0292, "step": 1069 }, { "epoch": 0.30510407755916735, "grad_norm": 0.7881838063455426, "learning_rate": 8.145225840414641e-06, "loss": 0.0101, "step": 1070 }, { "epoch": 0.30538922155688625, "grad_norm": 1.6796194211057058, "learning_rate": 8.141634119820101e-06, "loss": 0.0287, "step": 1071 }, { "epoch": 0.3056743655546051, "grad_norm": 1.2891380669074353, "learning_rate": 8.138039718563578e-06, "loss": 0.0492, "step": 1072 }, { "epoch": 0.3059595095523239, "grad_norm": 2.357993042502323, "learning_rate": 8.134442639712063e-06, "loss": 0.0456, "step": 1073 }, { "epoch": 0.30624465355004277, "grad_norm": 0.5191000758439441, "learning_rate": 8.130842886334837e-06, "loss": 0.0083, "step": 1074 }, { "epoch": 0.3065297975477616, "grad_norm": 0.9634677131357932, "learning_rate": 8.127240461503462e-06, "loss": 0.0159, "step": 1075 }, { "epoch": 0.30681494154548045, "grad_norm": 0.27264204707454004, "learning_rate": 8.123635368291777e-06, "loss": 0.0051, "step": 1076 }, { "epoch": 0.30710008554319934, "grad_norm": 2.2718130414005446, "learning_rate": 8.120027609775902e-06, "loss": 0.04, "step": 1077 }, { "epoch": 0.3073852295409182, "grad_norm": 1.2599259798035607, "learning_rate": 8.116417189034227e-06, "loss": 0.0234, "step": 1078 }, { "epoch": 0.307670373538637, "grad_norm": 1.2003202010441418, "learning_rate": 8.112804109147416e-06, "loss": 0.0137, "step": 1079 }, { "epoch": 0.30795551753635586, "grad_norm": 1.525563420275153, "learning_rate": 8.1091883731984e-06, "loss": 0.0452, "step": 1080 }, { "epoch": 0.3082406615340747, "grad_norm": 1.8645400213251933, "learning_rate": 8.10556998427238e-06, "loss": 0.0267, "step": 1081 }, { "epoch": 0.30852580553179354, "grad_norm": 2.063747391446373, "learning_rate": 8.10194894545682e-06, "loss": 0.0357, "step": 1082 }, { "epoch": 0.3088109495295124, "grad_norm": 1.2254249014948786, "learning_rate": 8.09832525984144e-06, "loss": 0.0226, "step": 1083 }, { "epoch": 0.3090960935272313, "grad_norm": 1.4378801385084123, "learning_rate": 8.094698930518224e-06, "loss": 0.0207, "step": 1084 }, { "epoch": 0.3093812375249501, "grad_norm": 1.6399576082537426, "learning_rate": 8.091069960581408e-06, "loss": 0.0208, "step": 1085 }, { "epoch": 0.30966638152266895, "grad_norm": 0.4934963525040569, "learning_rate": 8.087438353127486e-06, "loss": 0.0065, "step": 1086 }, { "epoch": 0.3099515255203878, "grad_norm": 1.009916460402547, "learning_rate": 8.083804111255197e-06, "loss": 0.0179, "step": 1087 }, { "epoch": 0.31023666951810663, "grad_norm": 0.8948637487646737, "learning_rate": 8.08016723806553e-06, "loss": 0.0202, "step": 1088 }, { "epoch": 0.3105218135158255, "grad_norm": 1.3076962346304433, "learning_rate": 8.076527736661724e-06, "loss": 0.0362, "step": 1089 }, { "epoch": 0.3108069575135443, "grad_norm": 2.205028660931532, "learning_rate": 8.072885610149251e-06, "loss": 0.0309, "step": 1090 }, { "epoch": 0.3110921015112632, "grad_norm": 1.4313613753949053, "learning_rate": 8.06924086163583e-06, "loss": 0.021, "step": 1091 }, { "epoch": 0.31137724550898205, "grad_norm": 2.054475718794255, "learning_rate": 8.065593494231418e-06, "loss": 0.0468, "step": 1092 }, { "epoch": 0.3116623895067009, "grad_norm": 1.7740986866927582, "learning_rate": 8.061943511048199e-06, "loss": 0.0516, "step": 1093 }, { "epoch": 0.3119475335044197, "grad_norm": 46.640258129930466, "learning_rate": 8.058290915200597e-06, "loss": 0.1971, "step": 1094 }, { "epoch": 0.31223267750213857, "grad_norm": 0.781762262501865, "learning_rate": 8.054635709805263e-06, "loss": 0.0248, "step": 1095 }, { "epoch": 0.3125178214998574, "grad_norm": 1.082335674544725, "learning_rate": 8.050977897981071e-06, "loss": 0.0357, "step": 1096 }, { "epoch": 0.3128029654975763, "grad_norm": 1.7052642583096669, "learning_rate": 8.047317482849124e-06, "loss": 0.0305, "step": 1097 }, { "epoch": 0.31308810949529514, "grad_norm": 1.9320064281915101, "learning_rate": 8.043654467532744e-06, "loss": 0.0486, "step": 1098 }, { "epoch": 0.313373253493014, "grad_norm": 2.6104512138319564, "learning_rate": 8.039988855157472e-06, "loss": 0.0608, "step": 1099 }, { "epoch": 0.3136583974907328, "grad_norm": 0.906309465890772, "learning_rate": 8.036320648851064e-06, "loss": 0.0188, "step": 1100 }, { "epoch": 0.31394354148845166, "grad_norm": 1.2528075290998113, "learning_rate": 8.032649851743493e-06, "loss": 0.0458, "step": 1101 }, { "epoch": 0.3142286854861705, "grad_norm": 1.7482113067259941, "learning_rate": 8.028976466966934e-06, "loss": 0.0369, "step": 1102 }, { "epoch": 0.31451382948388934, "grad_norm": 2.9189918710398755, "learning_rate": 8.025300497655783e-06, "loss": 0.0377, "step": 1103 }, { "epoch": 0.31479897348160824, "grad_norm": 1.0983245681069267, "learning_rate": 8.021621946946628e-06, "loss": 0.0242, "step": 1104 }, { "epoch": 0.3150841174793271, "grad_norm": 1.2121132358162416, "learning_rate": 8.01794081797827e-06, "loss": 0.0335, "step": 1105 }, { "epoch": 0.3153692614770459, "grad_norm": 0.8067611328102654, "learning_rate": 8.014257113891704e-06, "loss": 0.0301, "step": 1106 }, { "epoch": 0.31565440547476475, "grad_norm": 0.8529627858655344, "learning_rate": 8.010570837830124e-06, "loss": 0.0367, "step": 1107 }, { "epoch": 0.3159395494724836, "grad_norm": 0.8676313074444348, "learning_rate": 8.00688199293892e-06, "loss": 0.019, "step": 1108 }, { "epoch": 0.31622469347020243, "grad_norm": 0.7427102352300117, "learning_rate": 8.003190582365669e-06, "loss": 0.0262, "step": 1109 }, { "epoch": 0.3165098374679213, "grad_norm": 0.9736769790238554, "learning_rate": 7.999496609260144e-06, "loss": 0.0184, "step": 1110 }, { "epoch": 0.31679498146564017, "grad_norm": 1.27619488452561, "learning_rate": 7.995800076774301e-06, "loss": 0.0232, "step": 1111 }, { "epoch": 0.317080125463359, "grad_norm": 2.0389975152307476, "learning_rate": 7.992100988062277e-06, "loss": 0.0393, "step": 1112 }, { "epoch": 0.31736526946107785, "grad_norm": 0.7124345990268323, "learning_rate": 7.988399346280398e-06, "loss": 0.0157, "step": 1113 }, { "epoch": 0.3176504134587967, "grad_norm": 0.7642116938852253, "learning_rate": 7.98469515458716e-06, "loss": 0.0239, "step": 1114 }, { "epoch": 0.3179355574565155, "grad_norm": 0.8303694050770815, "learning_rate": 7.980988416143239e-06, "loss": 0.0188, "step": 1115 }, { "epoch": 0.31822070145423437, "grad_norm": 1.7299262380985978, "learning_rate": 7.977279134111487e-06, "loss": 0.0384, "step": 1116 }, { "epoch": 0.31850584545195326, "grad_norm": 0.821877902826892, "learning_rate": 7.973567311656917e-06, "loss": 0.0152, "step": 1117 }, { "epoch": 0.3187909894496721, "grad_norm": 0.841185794140611, "learning_rate": 7.96985295194672e-06, "loss": 0.0213, "step": 1118 }, { "epoch": 0.31907613344739094, "grad_norm": 2.9117336343809694, "learning_rate": 7.966136058150247e-06, "loss": 0.0337, "step": 1119 }, { "epoch": 0.3193612774451098, "grad_norm": 0.37780063918483164, "learning_rate": 7.962416633439008e-06, "loss": 0.0143, "step": 1120 }, { "epoch": 0.3196464214428286, "grad_norm": 1.1660782504301215, "learning_rate": 7.958694680986682e-06, "loss": 0.0258, "step": 1121 }, { "epoch": 0.31993156544054746, "grad_norm": 0.6348660624052973, "learning_rate": 7.954970203969095e-06, "loss": 0.0401, "step": 1122 }, { "epoch": 0.3202167094382663, "grad_norm": 0.5893537112468747, "learning_rate": 7.951243205564234e-06, "loss": 0.0106, "step": 1123 }, { "epoch": 0.3205018534359852, "grad_norm": 0.7759687299701817, "learning_rate": 7.947513688952234e-06, "loss": 0.0202, "step": 1124 }, { "epoch": 0.32078699743370404, "grad_norm": 1.7145098320011822, "learning_rate": 7.943781657315377e-06, "loss": 0.0227, "step": 1125 }, { "epoch": 0.3210721414314229, "grad_norm": 1.8907573081270583, "learning_rate": 7.940047113838096e-06, "loss": 0.0218, "step": 1126 }, { "epoch": 0.3213572854291417, "grad_norm": 1.1764345975362007, "learning_rate": 7.936310061706965e-06, "loss": 0.0399, "step": 1127 }, { "epoch": 0.32164242942686055, "grad_norm": 0.8821811693487115, "learning_rate": 7.932570504110697e-06, "loss": 0.0252, "step": 1128 }, { "epoch": 0.3219275734245794, "grad_norm": 0.8404345204905768, "learning_rate": 7.928828444240144e-06, "loss": 0.0273, "step": 1129 }, { "epoch": 0.32221271742229823, "grad_norm": 0.714789071782589, "learning_rate": 7.925083885288296e-06, "loss": 0.0165, "step": 1130 }, { "epoch": 0.32249786142001713, "grad_norm": 0.9439440999089335, "learning_rate": 7.921336830450268e-06, "loss": 0.0174, "step": 1131 }, { "epoch": 0.32278300541773597, "grad_norm": 1.9591792214351356, "learning_rate": 7.917587282923312e-06, "loss": 0.0492, "step": 1132 }, { "epoch": 0.3230681494154548, "grad_norm": 0.9159150616345992, "learning_rate": 7.913835245906805e-06, "loss": 0.0174, "step": 1133 }, { "epoch": 0.32335329341317365, "grad_norm": 1.6772121272739662, "learning_rate": 7.910080722602245e-06, "loss": 0.0219, "step": 1134 }, { "epoch": 0.3236384374108925, "grad_norm": 1.1616225108993936, "learning_rate": 7.906323716213256e-06, "loss": 0.0263, "step": 1135 }, { "epoch": 0.3239235814086113, "grad_norm": 1.151082996421702, "learning_rate": 7.902564229945577e-06, "loss": 0.0111, "step": 1136 }, { "epoch": 0.3242087254063302, "grad_norm": 1.2986081859094945, "learning_rate": 7.898802267007067e-06, "loss": 0.0116, "step": 1137 }, { "epoch": 0.32449386940404906, "grad_norm": 1.3947759655941292, "learning_rate": 7.895037830607692e-06, "loss": 0.0389, "step": 1138 }, { "epoch": 0.3247790134017679, "grad_norm": 1.0484963530466218, "learning_rate": 7.891270923959537e-06, "loss": 0.014, "step": 1139 }, { "epoch": 0.32506415739948674, "grad_norm": 1.504826560429216, "learning_rate": 7.887501550276789e-06, "loss": 0.0423, "step": 1140 }, { "epoch": 0.3253493013972056, "grad_norm": 1.5608082869525692, "learning_rate": 7.88372971277574e-06, "loss": 0.0684, "step": 1141 }, { "epoch": 0.3256344453949244, "grad_norm": 2.7628731807480653, "learning_rate": 7.879955414674784e-06, "loss": 0.0681, "step": 1142 }, { "epoch": 0.32591958939264326, "grad_norm": 1.613849056092281, "learning_rate": 7.87617865919442e-06, "loss": 0.0463, "step": 1143 }, { "epoch": 0.32620473339036216, "grad_norm": 2.5686764919053124, "learning_rate": 7.872399449557238e-06, "loss": 0.0536, "step": 1144 }, { "epoch": 0.326489877388081, "grad_norm": 1.4183432016049962, "learning_rate": 7.868617788987925e-06, "loss": 0.0285, "step": 1145 }, { "epoch": 0.32677502138579984, "grad_norm": 1.7124522563822497, "learning_rate": 7.864833680713256e-06, "loss": 0.0593, "step": 1146 }, { "epoch": 0.3270601653835187, "grad_norm": 1.2283635139885853, "learning_rate": 7.861047127962099e-06, "loss": 0.0321, "step": 1147 }, { "epoch": 0.3273453093812375, "grad_norm": 1.6249903419870981, "learning_rate": 7.857258133965405e-06, "loss": 0.0399, "step": 1148 }, { "epoch": 0.32763045337895635, "grad_norm": 3.053147088970876, "learning_rate": 7.853466701956208e-06, "loss": 0.0531, "step": 1149 }, { "epoch": 0.3279155973766752, "grad_norm": 24.31024864362681, "learning_rate": 7.849672835169625e-06, "loss": 0.1359, "step": 1150 }, { "epoch": 0.3282007413743941, "grad_norm": 1.696805065592631, "learning_rate": 7.845876536842846e-06, "loss": 0.0418, "step": 1151 }, { "epoch": 0.32848588537211293, "grad_norm": 1.3445704598716866, "learning_rate": 7.84207781021514e-06, "loss": 0.0254, "step": 1152 }, { "epoch": 0.32877102936983177, "grad_norm": 2.8671303035698616, "learning_rate": 7.838276658527847e-06, "loss": 0.0395, "step": 1153 }, { "epoch": 0.3290561733675506, "grad_norm": 2.130710041966832, "learning_rate": 7.834473085024373e-06, "loss": 0.0316, "step": 1154 }, { "epoch": 0.32934131736526945, "grad_norm": 2.6888945875232424, "learning_rate": 7.830667092950195e-06, "loss": 0.0586, "step": 1155 }, { "epoch": 0.3296264613629883, "grad_norm": 1.9325581252131516, "learning_rate": 7.826858685552851e-06, "loss": 0.0405, "step": 1156 }, { "epoch": 0.3299116053607072, "grad_norm": 2.1097020635632804, "learning_rate": 7.82304786608194e-06, "loss": 0.0237, "step": 1157 }, { "epoch": 0.330196749358426, "grad_norm": 1.0799679655070387, "learning_rate": 7.819234637789122e-06, "loss": 0.0218, "step": 1158 }, { "epoch": 0.33048189335614486, "grad_norm": 1.0940971424110126, "learning_rate": 7.815419003928107e-06, "loss": 0.0156, "step": 1159 }, { "epoch": 0.3307670373538637, "grad_norm": 0.45652875771745444, "learning_rate": 7.811600967754661e-06, "loss": 0.018, "step": 1160 }, { "epoch": 0.33105218135158254, "grad_norm": 1.5961724407974645, "learning_rate": 7.807780532526604e-06, "loss": 0.0442, "step": 1161 }, { "epoch": 0.3313373253493014, "grad_norm": 1.5457552919670423, "learning_rate": 7.80395770150379e-06, "loss": 0.035, "step": 1162 }, { "epoch": 0.3316224693470202, "grad_norm": 1.5335975806195943, "learning_rate": 7.800132477948137e-06, "loss": 0.0428, "step": 1163 }, { "epoch": 0.3319076133447391, "grad_norm": 1.791656104690741, "learning_rate": 7.796304865123583e-06, "loss": 0.0402, "step": 1164 }, { "epoch": 0.33219275734245796, "grad_norm": 0.8626638971656029, "learning_rate": 7.79247486629612e-06, "loss": 0.0207, "step": 1165 }, { "epoch": 0.3324779013401768, "grad_norm": 1.3786813993640084, "learning_rate": 7.788642484733773e-06, "loss": 0.0305, "step": 1166 }, { "epoch": 0.33276304533789564, "grad_norm": 0.9962974441495369, "learning_rate": 7.784807723706593e-06, "loss": 0.0369, "step": 1167 }, { "epoch": 0.3330481893356145, "grad_norm": 0.8448490830062997, "learning_rate": 7.780970586486668e-06, "loss": 0.032, "step": 1168 }, { "epoch": 0.3333333333333333, "grad_norm": 141.46865931940235, "learning_rate": 7.777131076348115e-06, "loss": 0.5834, "step": 1169 }, { "epoch": 0.33361847733105215, "grad_norm": 0.6240936201732423, "learning_rate": 7.773289196567066e-06, "loss": 0.0202, "step": 1170 }, { "epoch": 0.33390362132877105, "grad_norm": 1.6855103583591178, "learning_rate": 7.76944495042169e-06, "loss": 0.031, "step": 1171 }, { "epoch": 0.3341887653264899, "grad_norm": 1.061334418928402, "learning_rate": 7.76559834119216e-06, "loss": 0.0393, "step": 1172 }, { "epoch": 0.33447390932420873, "grad_norm": 2.3860632467930434, "learning_rate": 7.761749372160676e-06, "loss": 0.0546, "step": 1173 }, { "epoch": 0.33475905332192757, "grad_norm": 0.7787717819480605, "learning_rate": 7.757898046611446e-06, "loss": 0.0196, "step": 1174 }, { "epoch": 0.3350441973196464, "grad_norm": 1.1796680979025957, "learning_rate": 7.754044367830689e-06, "loss": 0.038, "step": 1175 }, { "epoch": 0.33532934131736525, "grad_norm": 1.3148557580390934, "learning_rate": 7.750188339106635e-06, "loss": 0.0304, "step": 1176 }, { "epoch": 0.33561448531508414, "grad_norm": 1.2054539450580688, "learning_rate": 7.746329963729517e-06, "loss": 0.0228, "step": 1177 }, { "epoch": 0.335899629312803, "grad_norm": 2.0449048447789386, "learning_rate": 7.742469244991572e-06, "loss": 0.0661, "step": 1178 }, { "epoch": 0.3361847733105218, "grad_norm": 0.5740583915566436, "learning_rate": 7.738606186187034e-06, "loss": 0.0179, "step": 1179 }, { "epoch": 0.33646991730824066, "grad_norm": 0.6910488636543247, "learning_rate": 7.734740790612137e-06, "loss": 0.0277, "step": 1180 }, { "epoch": 0.3367550613059595, "grad_norm": 1.0694033044260076, "learning_rate": 7.730873061565101e-06, "loss": 0.0224, "step": 1181 }, { "epoch": 0.33704020530367834, "grad_norm": 0.790204261875235, "learning_rate": 7.72700300234615e-06, "loss": 0.0248, "step": 1182 }, { "epoch": 0.3373253493013972, "grad_norm": 1.33625424302328, "learning_rate": 7.723130616257485e-06, "loss": 0.0274, "step": 1183 }, { "epoch": 0.3376104932991161, "grad_norm": 1.0567353013473344, "learning_rate": 7.719255906603298e-06, "loss": 0.0232, "step": 1184 }, { "epoch": 0.3378956372968349, "grad_norm": 0.30339439930941897, "learning_rate": 7.715378876689763e-06, "loss": 0.0076, "step": 1185 }, { "epoch": 0.33818078129455376, "grad_norm": 0.6213587414411561, "learning_rate": 7.711499529825032e-06, "loss": 0.0182, "step": 1186 }, { "epoch": 0.3384659252922726, "grad_norm": 1.6039813029461647, "learning_rate": 7.707617869319235e-06, "loss": 0.041, "step": 1187 }, { "epoch": 0.33875106928999144, "grad_norm": 0.9910996874197118, "learning_rate": 7.703733898484479e-06, "loss": 0.0186, "step": 1188 }, { "epoch": 0.3390362132877103, "grad_norm": 0.4274479084393529, "learning_rate": 7.699847620634834e-06, "loss": 0.0073, "step": 1189 }, { "epoch": 0.3393213572854291, "grad_norm": 1.1140118331424291, "learning_rate": 7.695959039086349e-06, "loss": 0.0169, "step": 1190 }, { "epoch": 0.339606501283148, "grad_norm": 0.7561760159586366, "learning_rate": 7.692068157157032e-06, "loss": 0.0165, "step": 1191 }, { "epoch": 0.33989164528086685, "grad_norm": 0.5463013328367404, "learning_rate": 7.688174978166855e-06, "loss": 0.0118, "step": 1192 }, { "epoch": 0.3401767892785857, "grad_norm": 1.1686541927437784, "learning_rate": 7.684279505437754e-06, "loss": 0.0183, "step": 1193 }, { "epoch": 0.34046193327630453, "grad_norm": 1.5635211370198485, "learning_rate": 7.680381742293615e-06, "loss": 0.0548, "step": 1194 }, { "epoch": 0.34074707727402337, "grad_norm": 1.3537452233019234, "learning_rate": 7.676481692060284e-06, "loss": 0.024, "step": 1195 }, { "epoch": 0.3410322212717422, "grad_norm": 0.7266402871499255, "learning_rate": 7.672579358065554e-06, "loss": 0.0175, "step": 1196 }, { "epoch": 0.3413173652694611, "grad_norm": 0.5773819066167983, "learning_rate": 7.668674743639173e-06, "loss": 0.0158, "step": 1197 }, { "epoch": 0.34160250926717994, "grad_norm": 1.1945971562545137, "learning_rate": 7.66476785211283e-06, "loss": 0.0232, "step": 1198 }, { "epoch": 0.3418876532648988, "grad_norm": 1.7947296038208627, "learning_rate": 7.660858686820157e-06, "loss": 0.0372, "step": 1199 }, { "epoch": 0.3421727972626176, "grad_norm": 1.2339145196734935, "learning_rate": 7.656947251096729e-06, "loss": 0.0301, "step": 1200 }, { "epoch": 0.34245794126033646, "grad_norm": 1.4239632584351836, "learning_rate": 7.653033548280056e-06, "loss": 0.0223, "step": 1201 }, { "epoch": 0.3427430852580553, "grad_norm": 3.5661005357954902, "learning_rate": 7.649117581709581e-06, "loss": 0.0782, "step": 1202 }, { "epoch": 0.34302822925577414, "grad_norm": 0.4606205342130907, "learning_rate": 7.645199354726681e-06, "loss": 0.008, "step": 1203 }, { "epoch": 0.34331337325349304, "grad_norm": 1.1808740305202428, "learning_rate": 7.641278870674664e-06, "loss": 0.0138, "step": 1204 }, { "epoch": 0.3435985172512119, "grad_norm": 1.181312784517819, "learning_rate": 7.637356132898762e-06, "loss": 0.0113, "step": 1205 }, { "epoch": 0.3438836612489307, "grad_norm": 1.9393291910241433, "learning_rate": 7.633431144746123e-06, "loss": 0.0366, "step": 1206 }, { "epoch": 0.34416880524664956, "grad_norm": 0.5655309202758987, "learning_rate": 7.629503909565829e-06, "loss": 0.0084, "step": 1207 }, { "epoch": 0.3444539492443684, "grad_norm": 0.9828332776151091, "learning_rate": 7.625574430708867e-06, "loss": 0.0436, "step": 1208 }, { "epoch": 0.34473909324208724, "grad_norm": 1.9944966409431035, "learning_rate": 7.621642711528143e-06, "loss": 0.0303, "step": 1209 }, { "epoch": 0.3450242372398061, "grad_norm": 0.7784849227021717, "learning_rate": 7.617708755378477e-06, "loss": 0.0121, "step": 1210 }, { "epoch": 0.34530938123752497, "grad_norm": 1.4230792460322752, "learning_rate": 7.613772565616595e-06, "loss": 0.0213, "step": 1211 }, { "epoch": 0.3455945252352438, "grad_norm": 1.1879362008117293, "learning_rate": 7.609834145601129e-06, "loss": 0.0331, "step": 1212 }, { "epoch": 0.34587966923296265, "grad_norm": 0.9145028403418679, "learning_rate": 7.605893498692616e-06, "loss": 0.0103, "step": 1213 }, { "epoch": 0.3461648132306815, "grad_norm": 0.8655139809708533, "learning_rate": 7.601950628253489e-06, "loss": 0.023, "step": 1214 }, { "epoch": 0.34644995722840033, "grad_norm": 1.7832506022566148, "learning_rate": 7.598005537648082e-06, "loss": 0.0564, "step": 1215 }, { "epoch": 0.34673510122611917, "grad_norm": 2.1239358299514586, "learning_rate": 7.594058230242623e-06, "loss": 0.0366, "step": 1216 }, { "epoch": 0.34702024522383806, "grad_norm": 0.878389399490451, "learning_rate": 7.59010870940523e-06, "loss": 0.0119, "step": 1217 }, { "epoch": 0.3473053892215569, "grad_norm": 2.733265706518621, "learning_rate": 7.58615697850591e-06, "loss": 0.0543, "step": 1218 }, { "epoch": 0.34759053321927574, "grad_norm": 0.7518484980410628, "learning_rate": 7.582203040916558e-06, "loss": 0.0129, "step": 1219 }, { "epoch": 0.3478756772169946, "grad_norm": 0.9989613237234266, "learning_rate": 7.578246900010948e-06, "loss": 0.0128, "step": 1220 }, { "epoch": 0.3481608212147134, "grad_norm": 1.0141029301162927, "learning_rate": 7.57428855916474e-06, "loss": 0.0225, "step": 1221 }, { "epoch": 0.34844596521243226, "grad_norm": 1.820213625141049, "learning_rate": 7.5703280217554594e-06, "loss": 0.0539, "step": 1222 }, { "epoch": 0.3487311092101511, "grad_norm": 0.4916304901754792, "learning_rate": 7.566365291162523e-06, "loss": 0.0087, "step": 1223 }, { "epoch": 0.34901625320787, "grad_norm": 1.5289568499969866, "learning_rate": 7.5624003707672036e-06, "loss": 0.0262, "step": 1224 }, { "epoch": 0.34930139720558884, "grad_norm": 0.6704716345946136, "learning_rate": 7.558433263952652e-06, "loss": 0.0263, "step": 1225 }, { "epoch": 0.3495865412033077, "grad_norm": 0.8476756802724102, "learning_rate": 7.554463974103876e-06, "loss": 0.0091, "step": 1226 }, { "epoch": 0.3498716852010265, "grad_norm": 0.5944495719636592, "learning_rate": 7.5504925046077596e-06, "loss": 0.0154, "step": 1227 }, { "epoch": 0.35015682919874536, "grad_norm": 1.545818848015318, "learning_rate": 7.546518858853032e-06, "loss": 0.0341, "step": 1228 }, { "epoch": 0.3504419731964642, "grad_norm": 1.9221079880766434, "learning_rate": 7.542543040230287e-06, "loss": 0.0332, "step": 1229 }, { "epoch": 0.35072711719418304, "grad_norm": 1.9442964587621157, "learning_rate": 7.538565052131972e-06, "loss": 0.0424, "step": 1230 }, { "epoch": 0.35101226119190193, "grad_norm": 2.1530989721825544, "learning_rate": 7.534584897952385e-06, "loss": 0.0644, "step": 1231 }, { "epoch": 0.35129740518962077, "grad_norm": 1.0592053494006788, "learning_rate": 7.530602581087672e-06, "loss": 0.0418, "step": 1232 }, { "epoch": 0.3515825491873396, "grad_norm": 2.701165505211902, "learning_rate": 7.526618104935824e-06, "loss": 0.0423, "step": 1233 }, { "epoch": 0.35186769318505845, "grad_norm": 1.9681046663599653, "learning_rate": 7.522631472896671e-06, "loss": 0.0292, "step": 1234 }, { "epoch": 0.3521528371827773, "grad_norm": 1.0834902051848114, "learning_rate": 7.518642688371893e-06, "loss": 0.0165, "step": 1235 }, { "epoch": 0.35243798118049613, "grad_norm": 0.729111577118215, "learning_rate": 7.514651754764996e-06, "loss": 0.0113, "step": 1236 }, { "epoch": 0.352723125178215, "grad_norm": 0.9677835962247436, "learning_rate": 7.510658675481324e-06, "loss": 0.0089, "step": 1237 }, { "epoch": 0.35300826917593386, "grad_norm": 1.7687113644828218, "learning_rate": 7.5066634539280524e-06, "loss": 0.0368, "step": 1238 }, { "epoch": 0.3532934131736527, "grad_norm": 0.8467747355722813, "learning_rate": 7.502666093514184e-06, "loss": 0.0212, "step": 1239 }, { "epoch": 0.35357855717137154, "grad_norm": 1.1715828228303156, "learning_rate": 7.498666597650544e-06, "loss": 0.0316, "step": 1240 }, { "epoch": 0.3538637011690904, "grad_norm": 0.9081021165247576, "learning_rate": 7.494664969749785e-06, "loss": 0.0227, "step": 1241 }, { "epoch": 0.3541488451668092, "grad_norm": 1.1369108712808864, "learning_rate": 7.490661213226374e-06, "loss": 0.0359, "step": 1242 }, { "epoch": 0.35443398916452806, "grad_norm": 2.0527453246581557, "learning_rate": 7.486655331496597e-06, "loss": 0.0355, "step": 1243 }, { "epoch": 0.35471913316224696, "grad_norm": 1.0630257009017088, "learning_rate": 7.482647327978551e-06, "loss": 0.022, "step": 1244 }, { "epoch": 0.3550042771599658, "grad_norm": 1.1552022033934928, "learning_rate": 7.478637206092147e-06, "loss": 0.0244, "step": 1245 }, { "epoch": 0.35528942115768464, "grad_norm": 1.2041123400974558, "learning_rate": 7.474624969259101e-06, "loss": 0.0311, "step": 1246 }, { "epoch": 0.3555745651554035, "grad_norm": 0.6912486293307928, "learning_rate": 7.470610620902934e-06, "loss": 0.016, "step": 1247 }, { "epoch": 0.3558597091531223, "grad_norm": 0.7069761757438487, "learning_rate": 7.466594164448967e-06, "loss": 0.0121, "step": 1248 }, { "epoch": 0.35614485315084116, "grad_norm": 2.406861730589254, "learning_rate": 7.462575603324325e-06, "loss": 0.0573, "step": 1249 }, { "epoch": 0.35642999714856, "grad_norm": 1.073491482084253, "learning_rate": 7.458554940957922e-06, "loss": 0.0285, "step": 1250 }, { "epoch": 0.3567151411462789, "grad_norm": 1.4809174105565874, "learning_rate": 7.45453218078047e-06, "loss": 0.0187, "step": 1251 }, { "epoch": 0.35700028514399773, "grad_norm": 2.5263068063454104, "learning_rate": 7.450507326224469e-06, "loss": 0.0346, "step": 1252 }, { "epoch": 0.35728542914171657, "grad_norm": 1.0743279609219285, "learning_rate": 7.446480380724208e-06, "loss": 0.015, "step": 1253 }, { "epoch": 0.3575705731394354, "grad_norm": 1.2267919789411827, "learning_rate": 7.442451347715758e-06, "loss": 0.041, "step": 1254 }, { "epoch": 0.35785571713715425, "grad_norm": 1.0637373721906973, "learning_rate": 7.438420230636969e-06, "loss": 0.0161, "step": 1255 }, { "epoch": 0.3581408611348731, "grad_norm": 1.8824790740102795, "learning_rate": 7.434387032927475e-06, "loss": 0.0358, "step": 1256 }, { "epoch": 0.358426005132592, "grad_norm": 1.2092687912850297, "learning_rate": 7.430351758028682e-06, "loss": 0.0301, "step": 1257 }, { "epoch": 0.3587111491303108, "grad_norm": 0.6835211094012288, "learning_rate": 7.426314409383768e-06, "loss": 0.0062, "step": 1258 }, { "epoch": 0.35899629312802966, "grad_norm": 1.3255750122183771, "learning_rate": 7.422274990437682e-06, "loss": 0.0318, "step": 1259 }, { "epoch": 0.3592814371257485, "grad_norm": 0.9156502566543411, "learning_rate": 7.418233504637138e-06, "loss": 0.0143, "step": 1260 }, { "epoch": 0.35956658112346734, "grad_norm": 1.1727165754738422, "learning_rate": 7.414189955430615e-06, "loss": 0.0369, "step": 1261 }, { "epoch": 0.3598517251211862, "grad_norm": 1.4468665217354895, "learning_rate": 7.410144346268351e-06, "loss": 0.0417, "step": 1262 }, { "epoch": 0.360136869118905, "grad_norm": 1.824457712002065, "learning_rate": 7.4060966806023445e-06, "loss": 0.0353, "step": 1263 }, { "epoch": 0.3604220131166239, "grad_norm": 0.9767535191086671, "learning_rate": 7.4020469618863455e-06, "loss": 0.021, "step": 1264 }, { "epoch": 0.36070715711434276, "grad_norm": 1.079593807537144, "learning_rate": 7.3979951935758596e-06, "loss": 0.0178, "step": 1265 }, { "epoch": 0.3609923011120616, "grad_norm": 1.6139308826860321, "learning_rate": 7.393941379128136e-06, "loss": 0.0293, "step": 1266 }, { "epoch": 0.36127744510978044, "grad_norm": 1.1527677219155397, "learning_rate": 7.3898855220021734e-06, "loss": 0.032, "step": 1267 }, { "epoch": 0.3615625891074993, "grad_norm": 1.5856659207811066, "learning_rate": 7.385827625658713e-06, "loss": 0.0261, "step": 1268 }, { "epoch": 0.3618477331052181, "grad_norm": 1.086362153999573, "learning_rate": 7.3817676935602376e-06, "loss": 0.0128, "step": 1269 }, { "epoch": 0.36213287710293696, "grad_norm": 1.6988841433717032, "learning_rate": 7.377705729170962e-06, "loss": 0.0371, "step": 1270 }, { "epoch": 0.36241802110065585, "grad_norm": 1.4325229855389487, "learning_rate": 7.373641735956843e-06, "loss": 0.0227, "step": 1271 }, { "epoch": 0.3627031650983747, "grad_norm": 0.8830703221340672, "learning_rate": 7.369575717385557e-06, "loss": 0.019, "step": 1272 }, { "epoch": 0.36298830909609353, "grad_norm": 2.209036557846552, "learning_rate": 7.365507676926523e-06, "loss": 0.0346, "step": 1273 }, { "epoch": 0.36327345309381237, "grad_norm": 1.5334503346203225, "learning_rate": 7.361437618050873e-06, "loss": 0.0407, "step": 1274 }, { "epoch": 0.3635585970915312, "grad_norm": 0.8070701947902329, "learning_rate": 7.3573655442314674e-06, "loss": 0.0324, "step": 1275 }, { "epoch": 0.36384374108925005, "grad_norm": 1.430194693309841, "learning_rate": 7.353291458942884e-06, "loss": 0.0196, "step": 1276 }, { "epoch": 0.36412888508696895, "grad_norm": 0.3562037606493946, "learning_rate": 7.349215365661417e-06, "loss": 0.0126, "step": 1277 }, { "epoch": 0.3644140290846878, "grad_norm": 1.1955582337051391, "learning_rate": 7.345137267865075e-06, "loss": 0.0287, "step": 1278 }, { "epoch": 0.3646991730824066, "grad_norm": 1.5568689189039489, "learning_rate": 7.341057169033576e-06, "loss": 0.0283, "step": 1279 }, { "epoch": 0.36498431708012546, "grad_norm": 1.3478008946354623, "learning_rate": 7.336975072648346e-06, "loss": 0.0288, "step": 1280 }, { "epoch": 0.3652694610778443, "grad_norm": 1.0580096876065161, "learning_rate": 7.332890982192514e-06, "loss": 0.0166, "step": 1281 }, { "epoch": 0.36555460507556314, "grad_norm": 0.5686865684902279, "learning_rate": 7.328804901150914e-06, "loss": 0.0116, "step": 1282 }, { "epoch": 0.365839749073282, "grad_norm": 1.716366052959552, "learning_rate": 7.324716833010074e-06, "loss": 0.0467, "step": 1283 }, { "epoch": 0.3661248930710009, "grad_norm": 0.7791990463845944, "learning_rate": 7.32062678125822e-06, "loss": 0.0111, "step": 1284 }, { "epoch": 0.3664100370687197, "grad_norm": 1.3893700452732172, "learning_rate": 7.316534749385272e-06, "loss": 0.0194, "step": 1285 }, { "epoch": 0.36669518106643856, "grad_norm": 1.5762832721071067, "learning_rate": 7.312440740882836e-06, "loss": 0.0238, "step": 1286 }, { "epoch": 0.3669803250641574, "grad_norm": 1.0209692180493717, "learning_rate": 7.308344759244208e-06, "loss": 0.0154, "step": 1287 }, { "epoch": 0.36726546906187624, "grad_norm": 0.3701931947426716, "learning_rate": 7.304246807964363e-06, "loss": 0.0061, "step": 1288 }, { "epoch": 0.3675506130595951, "grad_norm": 0.9193845325767704, "learning_rate": 7.300146890539962e-06, "loss": 0.0277, "step": 1289 }, { "epoch": 0.3678357570573139, "grad_norm": 1.8222398498763657, "learning_rate": 7.2960450104693415e-06, "loss": 0.04, "step": 1290 }, { "epoch": 0.3681209010550328, "grad_norm": 0.643942314383188, "learning_rate": 7.291941171252512e-06, "loss": 0.0084, "step": 1291 }, { "epoch": 0.36840604505275165, "grad_norm": 2.2276882078911346, "learning_rate": 7.287835376391157e-06, "loss": 0.0351, "step": 1292 }, { "epoch": 0.3686911890504705, "grad_norm": 1.0734551871869193, "learning_rate": 7.283727629388628e-06, "loss": 0.0255, "step": 1293 }, { "epoch": 0.36897633304818933, "grad_norm": 0.9919685813925658, "learning_rate": 7.279617933749937e-06, "loss": 0.0188, "step": 1294 }, { "epoch": 0.36926147704590817, "grad_norm": 1.6546322046981001, "learning_rate": 7.275506292981771e-06, "loss": 0.032, "step": 1295 }, { "epoch": 0.369546621043627, "grad_norm": 0.6427488874964754, "learning_rate": 7.271392710592466e-06, "loss": 0.0183, "step": 1296 }, { "epoch": 0.3698317650413459, "grad_norm": 2.1105811666825605, "learning_rate": 7.2672771900920195e-06, "loss": 0.0342, "step": 1297 }, { "epoch": 0.37011690903906475, "grad_norm": 1.7984565917708282, "learning_rate": 7.263159734992079e-06, "loss": 0.0358, "step": 1298 }, { "epoch": 0.3704020530367836, "grad_norm": 2.0892827974495973, "learning_rate": 7.259040348805948e-06, "loss": 0.0637, "step": 1299 }, { "epoch": 0.3706871970345024, "grad_norm": 1.9245223100243691, "learning_rate": 7.2549190350485734e-06, "loss": 0.0256, "step": 1300 }, { "epoch": 0.37097234103222126, "grad_norm": 1.332108407780507, "learning_rate": 7.250795797236549e-06, "loss": 0.0288, "step": 1301 }, { "epoch": 0.3712574850299401, "grad_norm": 2.832303589679555, "learning_rate": 7.246670638888109e-06, "loss": 0.0746, "step": 1302 }, { "epoch": 0.37154262902765894, "grad_norm": 1.105425246209834, "learning_rate": 7.242543563523128e-06, "loss": 0.0386, "step": 1303 }, { "epoch": 0.37182777302537784, "grad_norm": 1.228230849256297, "learning_rate": 7.238414574663115e-06, "loss": 0.0233, "step": 1304 }, { "epoch": 0.3721129170230967, "grad_norm": 1.4222277233624963, "learning_rate": 7.234283675831212e-06, "loss": 0.0163, "step": 1305 }, { "epoch": 0.3723980610208155, "grad_norm": 1.6737164988568887, "learning_rate": 7.230150870552191e-06, "loss": 0.0436, "step": 1306 }, { "epoch": 0.37268320501853436, "grad_norm": 0.6672570457070205, "learning_rate": 7.22601616235245e-06, "loss": 0.0154, "step": 1307 }, { "epoch": 0.3729683490162532, "grad_norm": 0.5059774786468529, "learning_rate": 7.221879554760012e-06, "loss": 0.0119, "step": 1308 }, { "epoch": 0.37325349301397204, "grad_norm": 2.10429975207149, "learning_rate": 7.217741051304519e-06, "loss": 0.0672, "step": 1309 }, { "epoch": 0.3735386370116909, "grad_norm": 2.0092255665050374, "learning_rate": 7.213600655517233e-06, "loss": 0.0542, "step": 1310 }, { "epoch": 0.3738237810094098, "grad_norm": 1.352010356984478, "learning_rate": 7.209458370931029e-06, "loss": 0.0142, "step": 1311 }, { "epoch": 0.3741089250071286, "grad_norm": 1.3811098769189896, "learning_rate": 7.205314201080392e-06, "loss": 0.0243, "step": 1312 }, { "epoch": 0.37439406900484745, "grad_norm": 1.1209070881289154, "learning_rate": 7.201168149501421e-06, "loss": 0.0389, "step": 1313 }, { "epoch": 0.3746792130025663, "grad_norm": 2.065215176648648, "learning_rate": 7.197020219731814e-06, "loss": 0.0266, "step": 1314 }, { "epoch": 0.37496435700028513, "grad_norm": 2.5480094718136703, "learning_rate": 7.192870415310877e-06, "loss": 0.0328, "step": 1315 }, { "epoch": 0.37524950099800397, "grad_norm": 0.7872176078118113, "learning_rate": 7.188718739779511e-06, "loss": 0.0139, "step": 1316 }, { "epoch": 0.37553464499572287, "grad_norm": 0.6808650456171224, "learning_rate": 7.1845651966802184e-06, "loss": 0.0234, "step": 1317 }, { "epoch": 0.3758197889934417, "grad_norm": 0.49101205287280486, "learning_rate": 7.1804097895570924e-06, "loss": 0.0178, "step": 1318 }, { "epoch": 0.37610493299116055, "grad_norm": 1.0821122693778222, "learning_rate": 7.176252521955817e-06, "loss": 0.025, "step": 1319 }, { "epoch": 0.3763900769888794, "grad_norm": 1.6304437741369961, "learning_rate": 7.172093397423663e-06, "loss": 0.0422, "step": 1320 }, { "epoch": 0.3766752209865982, "grad_norm": 0.7386475855241287, "learning_rate": 7.167932419509485e-06, "loss": 0.0249, "step": 1321 }, { "epoch": 0.37696036498431706, "grad_norm": 1.2169154892764618, "learning_rate": 7.163769591763723e-06, "loss": 0.045, "step": 1322 }, { "epoch": 0.3772455089820359, "grad_norm": 1.780773790899091, "learning_rate": 7.159604917738392e-06, "loss": 0.0457, "step": 1323 }, { "epoch": 0.3775306529797548, "grad_norm": 1.3352312459837943, "learning_rate": 7.155438400987083e-06, "loss": 0.0429, "step": 1324 }, { "epoch": 0.37781579697747364, "grad_norm": 1.1305902394114138, "learning_rate": 7.151270045064958e-06, "loss": 0.0341, "step": 1325 }, { "epoch": 0.3781009409751925, "grad_norm": 1.126670135618066, "learning_rate": 7.147099853528753e-06, "loss": 0.0199, "step": 1326 }, { "epoch": 0.3783860849729113, "grad_norm": 1.137621576993765, "learning_rate": 7.142927829936766e-06, "loss": 0.0518, "step": 1327 }, { "epoch": 0.37867122897063016, "grad_norm": 1.1773282692234626, "learning_rate": 7.138753977848858e-06, "loss": 0.0279, "step": 1328 }, { "epoch": 0.378956372968349, "grad_norm": 0.5840674438803083, "learning_rate": 7.134578300826452e-06, "loss": 0.0224, "step": 1329 }, { "epoch": 0.37924151696606784, "grad_norm": 1.1129186349385658, "learning_rate": 7.130400802432529e-06, "loss": 0.0322, "step": 1330 }, { "epoch": 0.37952666096378673, "grad_norm": 1.8462980707500372, "learning_rate": 7.12622148623162e-06, "loss": 0.0342, "step": 1331 }, { "epoch": 0.3798118049615056, "grad_norm": 2.107804657803562, "learning_rate": 7.122040355789815e-06, "loss": 0.0427, "step": 1332 }, { "epoch": 0.3800969489592244, "grad_norm": 0.7450612627575915, "learning_rate": 7.117857414674741e-06, "loss": 0.0197, "step": 1333 }, { "epoch": 0.38038209295694325, "grad_norm": 1.1691762047073135, "learning_rate": 7.11367266645558e-06, "loss": 0.0276, "step": 1334 }, { "epoch": 0.3806672369546621, "grad_norm": 1.0971833274442095, "learning_rate": 7.1094861147030514e-06, "loss": 0.0326, "step": 1335 }, { "epoch": 0.38095238095238093, "grad_norm": 1.5810742872190795, "learning_rate": 7.105297762989413e-06, "loss": 0.038, "step": 1336 }, { "epoch": 0.3812375249500998, "grad_norm": 1.801470439156759, "learning_rate": 7.101107614888458e-06, "loss": 0.0214, "step": 1337 }, { "epoch": 0.38152266894781867, "grad_norm": 0.7796639176297265, "learning_rate": 7.096915673975517e-06, "loss": 0.0145, "step": 1338 }, { "epoch": 0.3818078129455375, "grad_norm": 0.3858558390410885, "learning_rate": 7.092721943827446e-06, "loss": 0.0105, "step": 1339 }, { "epoch": 0.38209295694325635, "grad_norm": 1.033014186947543, "learning_rate": 7.088526428022628e-06, "loss": 0.0281, "step": 1340 }, { "epoch": 0.3823781009409752, "grad_norm": 1.9224999850785538, "learning_rate": 7.084329130140972e-06, "loss": 0.0465, "step": 1341 }, { "epoch": 0.382663244938694, "grad_norm": 1.6708579732466118, "learning_rate": 7.080130053763906e-06, "loss": 0.0379, "step": 1342 }, { "epoch": 0.38294838893641286, "grad_norm": 0.8987241866315161, "learning_rate": 7.075929202474374e-06, "loss": 0.0251, "step": 1343 }, { "epoch": 0.38323353293413176, "grad_norm": 1.339937928755385, "learning_rate": 7.071726579856838e-06, "loss": 0.0561, "step": 1344 }, { "epoch": 0.3835186769318506, "grad_norm": 0.958679031321206, "learning_rate": 7.067522189497269e-06, "loss": 0.041, "step": 1345 }, { "epoch": 0.38380382092956944, "grad_norm": 1.3336443045217854, "learning_rate": 7.063316034983146e-06, "loss": 0.0176, "step": 1346 }, { "epoch": 0.3840889649272883, "grad_norm": 0.5339713884516041, "learning_rate": 7.059108119903455e-06, "loss": 0.0127, "step": 1347 }, { "epoch": 0.3843741089250071, "grad_norm": 0.4258851843158557, "learning_rate": 7.054898447848684e-06, "loss": 0.0169, "step": 1348 }, { "epoch": 0.38465925292272596, "grad_norm": 1.1816276456155221, "learning_rate": 7.050687022410819e-06, "loss": 0.0324, "step": 1349 }, { "epoch": 0.3849443969204448, "grad_norm": 0.8946255755547217, "learning_rate": 7.0464738471833436e-06, "loss": 0.0326, "step": 1350 }, { "epoch": 0.3852295409181637, "grad_norm": 1.1202911450635167, "learning_rate": 7.042258925761233e-06, "loss": 0.0228, "step": 1351 }, { "epoch": 0.38551468491588253, "grad_norm": 1.6902455592606156, "learning_rate": 7.038042261740952e-06, "loss": 0.0473, "step": 1352 }, { "epoch": 0.3857998289136014, "grad_norm": 0.5948513631594184, "learning_rate": 7.033823858720454e-06, "loss": 0.02, "step": 1353 }, { "epoch": 0.3860849729113202, "grad_norm": 1.7951572919018244, "learning_rate": 7.029603720299178e-06, "loss": 0.0324, "step": 1354 }, { "epoch": 0.38637011690903905, "grad_norm": 2.0429468456038555, "learning_rate": 7.025381850078037e-06, "loss": 0.0345, "step": 1355 }, { "epoch": 0.3866552609067579, "grad_norm": 0.6318621960506157, "learning_rate": 7.021158251659429e-06, "loss": 0.0163, "step": 1356 }, { "epoch": 0.3869404049044768, "grad_norm": 1.4457221608558466, "learning_rate": 7.0169329286472235e-06, "loss": 0.0377, "step": 1357 }, { "epoch": 0.3872255489021956, "grad_norm": 1.01032061892398, "learning_rate": 7.01270588464676e-06, "loss": 0.0219, "step": 1358 }, { "epoch": 0.38751069289991447, "grad_norm": 0.748496429252588, "learning_rate": 7.008477123264849e-06, "loss": 0.0324, "step": 1359 }, { "epoch": 0.3877958368976333, "grad_norm": 1.1526332961541026, "learning_rate": 7.004246648109765e-06, "loss": 0.0254, "step": 1360 }, { "epoch": 0.38808098089535215, "grad_norm": 1.4294101465113094, "learning_rate": 7.000014462791245e-06, "loss": 0.024, "step": 1361 }, { "epoch": 0.388366124893071, "grad_norm": 0.5519193648295497, "learning_rate": 6.995780570920488e-06, "loss": 0.0244, "step": 1362 }, { "epoch": 0.3886512688907898, "grad_norm": 1.054024957106729, "learning_rate": 6.991544976110144e-06, "loss": 0.0187, "step": 1363 }, { "epoch": 0.3889364128885087, "grad_norm": 1.2014758092356506, "learning_rate": 6.98730768197432e-06, "loss": 0.0396, "step": 1364 }, { "epoch": 0.38922155688622756, "grad_norm": 0.7448263297295125, "learning_rate": 6.9830686921285724e-06, "loss": 0.0155, "step": 1365 }, { "epoch": 0.3895067008839464, "grad_norm": 0.844577177735389, "learning_rate": 6.978828010189903e-06, "loss": 0.0129, "step": 1366 }, { "epoch": 0.38979184488166524, "grad_norm": 0.818598787369236, "learning_rate": 6.974585639776757e-06, "loss": 0.0155, "step": 1367 }, { "epoch": 0.3900769888793841, "grad_norm": 0.6332227156256431, "learning_rate": 6.970341584509025e-06, "loss": 0.0143, "step": 1368 }, { "epoch": 0.3903621328771029, "grad_norm": 1.3864888311347223, "learning_rate": 6.966095848008028e-06, "loss": 0.0232, "step": 1369 }, { "epoch": 0.39064727687482176, "grad_norm": 0.807302899981098, "learning_rate": 6.9618484338965274e-06, "loss": 0.0136, "step": 1370 }, { "epoch": 0.39093242087254065, "grad_norm": 0.7532233628664813, "learning_rate": 6.957599345798714e-06, "loss": 0.0121, "step": 1371 }, { "epoch": 0.3912175648702595, "grad_norm": 0.8057671659634474, "learning_rate": 6.953348587340205e-06, "loss": 0.0078, "step": 1372 }, { "epoch": 0.39150270886797833, "grad_norm": 1.6056173303746508, "learning_rate": 6.949096162148048e-06, "loss": 0.0255, "step": 1373 }, { "epoch": 0.3917878528656972, "grad_norm": 0.921091363738595, "learning_rate": 6.944842073850709e-06, "loss": 0.0411, "step": 1374 }, { "epoch": 0.392072996863416, "grad_norm": 1.0604866928198136, "learning_rate": 6.94058632607807e-06, "loss": 0.0129, "step": 1375 }, { "epoch": 0.39235814086113485, "grad_norm": 1.0589721417569484, "learning_rate": 6.9363289224614395e-06, "loss": 0.0358, "step": 1376 }, { "epoch": 0.39264328485885375, "grad_norm": 0.7464112469258565, "learning_rate": 6.932069866633524e-06, "loss": 0.015, "step": 1377 }, { "epoch": 0.3929284288565726, "grad_norm": 0.911581766103546, "learning_rate": 6.927809162228456e-06, "loss": 0.0067, "step": 1378 }, { "epoch": 0.3932135728542914, "grad_norm": 1.9277059427524286, "learning_rate": 6.923546812881759e-06, "loss": 0.0528, "step": 1379 }, { "epoch": 0.39349871685201027, "grad_norm": 1.1879086450728118, "learning_rate": 6.919282822230372e-06, "loss": 0.0475, "step": 1380 }, { "epoch": 0.3937838608497291, "grad_norm": 1.0922499341160856, "learning_rate": 6.91501719391263e-06, "loss": 0.0235, "step": 1381 }, { "epoch": 0.39406900484744795, "grad_norm": 1.0937840797322815, "learning_rate": 6.910749931568265e-06, "loss": 0.0235, "step": 1382 }, { "epoch": 0.3943541488451668, "grad_norm": 0.7304053983961637, "learning_rate": 6.906481038838401e-06, "loss": 0.0258, "step": 1383 }, { "epoch": 0.3946392928428857, "grad_norm": 0.937712719820358, "learning_rate": 6.902210519365561e-06, "loss": 0.0246, "step": 1384 }, { "epoch": 0.3949244368406045, "grad_norm": 1.4809251423169405, "learning_rate": 6.897938376793646e-06, "loss": 0.0474, "step": 1385 }, { "epoch": 0.39520958083832336, "grad_norm": 0.6276884351320375, "learning_rate": 6.89366461476795e-06, "loss": 0.0167, "step": 1386 }, { "epoch": 0.3954947248360422, "grad_norm": 1.567668460942998, "learning_rate": 6.889389236935145e-06, "loss": 0.0301, "step": 1387 }, { "epoch": 0.39577986883376104, "grad_norm": 1.7706303043509528, "learning_rate": 6.885112246943282e-06, "loss": 0.0306, "step": 1388 }, { "epoch": 0.3960650128314799, "grad_norm": 1.4185752918637853, "learning_rate": 6.880833648441788e-06, "loss": 0.0288, "step": 1389 }, { "epoch": 0.3963501568291987, "grad_norm": 1.8432413219321235, "learning_rate": 6.876553445081463e-06, "loss": 0.0333, "step": 1390 }, { "epoch": 0.3966353008269176, "grad_norm": 0.7805065400041529, "learning_rate": 6.872271640514475e-06, "loss": 0.0211, "step": 1391 }, { "epoch": 0.39692044482463645, "grad_norm": 1.4299986011501835, "learning_rate": 6.867988238394361e-06, "loss": 0.018, "step": 1392 }, { "epoch": 0.3972055888223553, "grad_norm": 0.8738977526847553, "learning_rate": 6.863703242376016e-06, "loss": 0.0213, "step": 1393 }, { "epoch": 0.39749073282007413, "grad_norm": 1.883407055792336, "learning_rate": 6.859416656115702e-06, "loss": 0.0263, "step": 1394 }, { "epoch": 0.397775876817793, "grad_norm": 0.42498552122724303, "learning_rate": 6.855128483271033e-06, "loss": 0.0159, "step": 1395 }, { "epoch": 0.3980610208155118, "grad_norm": 0.6640145903003858, "learning_rate": 6.850838727500977e-06, "loss": 0.0241, "step": 1396 }, { "epoch": 0.3983461648132307, "grad_norm": 1.7915114587711756, "learning_rate": 6.846547392465854e-06, "loss": 0.0256, "step": 1397 }, { "epoch": 0.39863130881094955, "grad_norm": 2.294360606894688, "learning_rate": 6.8422544818273336e-06, "loss": 0.0479, "step": 1398 }, { "epoch": 0.3989164528086684, "grad_norm": 0.8371789914473499, "learning_rate": 6.837959999248423e-06, "loss": 0.019, "step": 1399 }, { "epoch": 0.3992015968063872, "grad_norm": 0.7596077582888351, "learning_rate": 6.833663948393479e-06, "loss": 0.0183, "step": 1400 }, { "epoch": 0.39948674080410607, "grad_norm": 1.3500494545169412, "learning_rate": 6.829366332928191e-06, "loss": 0.0322, "step": 1401 }, { "epoch": 0.3997718848018249, "grad_norm": 0.5127260480269076, "learning_rate": 6.825067156519584e-06, "loss": 0.0158, "step": 1402 }, { "epoch": 0.40005702879954375, "grad_norm": 0.6087593590841202, "learning_rate": 6.82076642283602e-06, "loss": 0.01, "step": 1403 }, { "epoch": 0.40034217279726264, "grad_norm": 1.174996634641959, "learning_rate": 6.816464135547183e-06, "loss": 0.0217, "step": 1404 }, { "epoch": 0.4006273167949815, "grad_norm": 0.8541388840281969, "learning_rate": 6.812160298324083e-06, "loss": 0.0153, "step": 1405 }, { "epoch": 0.4009124607927003, "grad_norm": 1.0410711202292575, "learning_rate": 6.807854914839061e-06, "loss": 0.0274, "step": 1406 }, { "epoch": 0.40119760479041916, "grad_norm": 1.2608003422751877, "learning_rate": 6.803547988765765e-06, "loss": 0.0348, "step": 1407 }, { "epoch": 0.401482748788138, "grad_norm": 0.8195812427271472, "learning_rate": 6.799239523779171e-06, "loss": 0.0203, "step": 1408 }, { "epoch": 0.40176789278585684, "grad_norm": 0.6649371018762259, "learning_rate": 6.794929523555559e-06, "loss": 0.0141, "step": 1409 }, { "epoch": 0.4020530367835757, "grad_norm": 0.38933100823711925, "learning_rate": 6.790617991772525e-06, "loss": 0.0063, "step": 1410 }, { "epoch": 0.4023381807812946, "grad_norm": 1.8663796814203395, "learning_rate": 6.786304932108967e-06, "loss": 0.0551, "step": 1411 }, { "epoch": 0.4026233247790134, "grad_norm": 2.198862423064739, "learning_rate": 6.781990348245092e-06, "loss": 0.0528, "step": 1412 }, { "epoch": 0.40290846877673225, "grad_norm": 1.5778666491396456, "learning_rate": 6.7776742438624026e-06, "loss": 0.0272, "step": 1413 }, { "epoch": 0.4031936127744511, "grad_norm": 1.082839151667033, "learning_rate": 6.773356622643703e-06, "loss": 0.0173, "step": 1414 }, { "epoch": 0.40347875677216993, "grad_norm": 1.2816980567733227, "learning_rate": 6.769037488273087e-06, "loss": 0.0382, "step": 1415 }, { "epoch": 0.4037639007698888, "grad_norm": 1.6873641138117723, "learning_rate": 6.764716844435944e-06, "loss": 0.0343, "step": 1416 }, { "epoch": 0.40404904476760767, "grad_norm": 1.5688848234236163, "learning_rate": 6.760394694818949e-06, "loss": 0.0323, "step": 1417 }, { "epoch": 0.4043341887653265, "grad_norm": 0.613312664131555, "learning_rate": 6.756071043110064e-06, "loss": 0.0163, "step": 1418 }, { "epoch": 0.40461933276304535, "grad_norm": 1.0497026541574568, "learning_rate": 6.751745892998527e-06, "loss": 0.0308, "step": 1419 }, { "epoch": 0.4049044767607642, "grad_norm": 1.6397322996436774, "learning_rate": 6.747419248174864e-06, "loss": 0.0263, "step": 1420 }, { "epoch": 0.405189620758483, "grad_norm": 0.718022101787763, "learning_rate": 6.743091112330866e-06, "loss": 0.0115, "step": 1421 }, { "epoch": 0.40547476475620187, "grad_norm": 0.8207958542231343, "learning_rate": 6.738761489159604e-06, "loss": 0.0319, "step": 1422 }, { "epoch": 0.4057599087539207, "grad_norm": 1.574000171294008, "learning_rate": 6.734430382355417e-06, "loss": 0.0293, "step": 1423 }, { "epoch": 0.4060450527516396, "grad_norm": 1.019984530943042, "learning_rate": 6.730097795613903e-06, "loss": 0.0124, "step": 1424 }, { "epoch": 0.40633019674935844, "grad_norm": 2.417307028940788, "learning_rate": 6.725763732631933e-06, "loss": 0.0384, "step": 1425 }, { "epoch": 0.4066153407470773, "grad_norm": 0.7247645218622035, "learning_rate": 6.721428197107631e-06, "loss": 0.0126, "step": 1426 }, { "epoch": 0.4069004847447961, "grad_norm": 1.48458332799653, "learning_rate": 6.717091192740378e-06, "loss": 0.0422, "step": 1427 }, { "epoch": 0.40718562874251496, "grad_norm": 2.9745577201399938, "learning_rate": 6.712752723230812e-06, "loss": 0.0758, "step": 1428 }, { "epoch": 0.4074707727402338, "grad_norm": 1.2408092239041282, "learning_rate": 6.708412792280816e-06, "loss": 0.0236, "step": 1429 }, { "epoch": 0.40775591673795264, "grad_norm": 0.505712627957945, "learning_rate": 6.704071403593524e-06, "loss": 0.0075, "step": 1430 }, { "epoch": 0.40804106073567153, "grad_norm": 0.8007241478639011, "learning_rate": 6.699728560873313e-06, "loss": 0.012, "step": 1431 }, { "epoch": 0.4083262047333904, "grad_norm": 1.22383156743963, "learning_rate": 6.695384267825799e-06, "loss": 0.022, "step": 1432 }, { "epoch": 0.4086113487311092, "grad_norm": 1.643834967167695, "learning_rate": 6.691038528157836e-06, "loss": 0.0297, "step": 1433 }, { "epoch": 0.40889649272882805, "grad_norm": 0.981130245185286, "learning_rate": 6.686691345577517e-06, "loss": 0.0311, "step": 1434 }, { "epoch": 0.4091816367265469, "grad_norm": 0.7535527214250041, "learning_rate": 6.682342723794157e-06, "loss": 0.0079, "step": 1435 }, { "epoch": 0.40946678072426573, "grad_norm": 1.4979534384423328, "learning_rate": 6.67799266651831e-06, "loss": 0.0286, "step": 1436 }, { "epoch": 0.40975192472198463, "grad_norm": 2.7428696820621137, "learning_rate": 6.673641177461743e-06, "loss": 0.0526, "step": 1437 }, { "epoch": 0.41003706871970347, "grad_norm": 3.078770079875373, "learning_rate": 6.669288260337455e-06, "loss": 0.0906, "step": 1438 }, { "epoch": 0.4103222127174223, "grad_norm": 1.6189746261305773, "learning_rate": 6.66493391885966e-06, "loss": 0.0318, "step": 1439 }, { "epoch": 0.41060735671514115, "grad_norm": 1.2290982178336693, "learning_rate": 6.660578156743782e-06, "loss": 0.0205, "step": 1440 }, { "epoch": 0.41089250071286, "grad_norm": 0.9641872975294026, "learning_rate": 6.656220977706465e-06, "loss": 0.0189, "step": 1441 }, { "epoch": 0.4111776447105788, "grad_norm": 1.1312515292408134, "learning_rate": 6.6518623854655615e-06, "loss": 0.0288, "step": 1442 }, { "epoch": 0.41146278870829767, "grad_norm": 0.8164142521434059, "learning_rate": 6.647502383740123e-06, "loss": 0.019, "step": 1443 }, { "epoch": 0.41174793270601656, "grad_norm": 2.1749643294622114, "learning_rate": 6.64314097625041e-06, "loss": 0.0326, "step": 1444 }, { "epoch": 0.4120330767037354, "grad_norm": 0.6688149744527794, "learning_rate": 6.638778166717879e-06, "loss": 0.0117, "step": 1445 }, { "epoch": 0.41231822070145424, "grad_norm": 0.9144749242956802, "learning_rate": 6.634413958865187e-06, "loss": 0.0292, "step": 1446 }, { "epoch": 0.4126033646991731, "grad_norm": 2.0650177025758447, "learning_rate": 6.6300483564161775e-06, "loss": 0.0408, "step": 1447 }, { "epoch": 0.4128885086968919, "grad_norm": 1.5032365198506417, "learning_rate": 6.625681363095892e-06, "loss": 0.0305, "step": 1448 }, { "epoch": 0.41317365269461076, "grad_norm": 1.2503228614033306, "learning_rate": 6.621312982630551e-06, "loss": 0.0214, "step": 1449 }, { "epoch": 0.4134587966923296, "grad_norm": 1.5579491899758215, "learning_rate": 6.616943218747566e-06, "loss": 0.0208, "step": 1450 }, { "epoch": 0.4137439406900485, "grad_norm": 0.5046986151483627, "learning_rate": 6.612572075175521e-06, "loss": 0.0077, "step": 1451 }, { "epoch": 0.41402908468776733, "grad_norm": 1.3996170926428428, "learning_rate": 6.608199555644186e-06, "loss": 0.0335, "step": 1452 }, { "epoch": 0.4143142286854862, "grad_norm": 1.8024425394477386, "learning_rate": 6.603825663884497e-06, "loss": 0.0391, "step": 1453 }, { "epoch": 0.414599372683205, "grad_norm": 1.4905431440579864, "learning_rate": 6.599450403628564e-06, "loss": 0.0417, "step": 1454 }, { "epoch": 0.41488451668092385, "grad_norm": 1.874227242654859, "learning_rate": 6.59507377860967e-06, "loss": 0.06, "step": 1455 }, { "epoch": 0.4151696606786427, "grad_norm": 0.6546908200537064, "learning_rate": 6.590695792562251e-06, "loss": 0.0129, "step": 1456 }, { "epoch": 0.4154548046763616, "grad_norm": 1.2440886365948998, "learning_rate": 6.5863164492219165e-06, "loss": 0.0125, "step": 1457 }, { "epoch": 0.41573994867408043, "grad_norm": 0.9649143457410724, "learning_rate": 6.5819357523254255e-06, "loss": 0.0215, "step": 1458 }, { "epoch": 0.41602509267179927, "grad_norm": 0.9042777652710828, "learning_rate": 6.577553705610695e-06, "loss": 0.0156, "step": 1459 }, { "epoch": 0.4163102366695181, "grad_norm": 1.5328371548614659, "learning_rate": 6.573170312816797e-06, "loss": 0.0389, "step": 1460 }, { "epoch": 0.41659538066723695, "grad_norm": 1.7516837901407996, "learning_rate": 6.568785577683945e-06, "loss": 0.0282, "step": 1461 }, { "epoch": 0.4168805246649558, "grad_norm": 0.7279340731763488, "learning_rate": 6.564399503953502e-06, "loss": 0.0082, "step": 1462 }, { "epoch": 0.4171656686626746, "grad_norm": 1.0423109263190555, "learning_rate": 6.560012095367976e-06, "loss": 0.0256, "step": 1463 }, { "epoch": 0.4174508126603935, "grad_norm": 0.8846001166727955, "learning_rate": 6.555623355671008e-06, "loss": 0.0184, "step": 1464 }, { "epoch": 0.41773595665811236, "grad_norm": 0.8616631964720626, "learning_rate": 6.551233288607378e-06, "loss": 0.0269, "step": 1465 }, { "epoch": 0.4180211006558312, "grad_norm": 0.6610304293410116, "learning_rate": 6.5468418979229995e-06, "loss": 0.011, "step": 1466 }, { "epoch": 0.41830624465355004, "grad_norm": 2.0231953278181263, "learning_rate": 6.542449187364913e-06, "loss": 0.0481, "step": 1467 }, { "epoch": 0.4185913886512689, "grad_norm": 0.7272833402001642, "learning_rate": 6.538055160681288e-06, "loss": 0.0129, "step": 1468 }, { "epoch": 0.4188765326489877, "grad_norm": 1.4865399155591759, "learning_rate": 6.533659821621414e-06, "loss": 0.0188, "step": 1469 }, { "epoch": 0.41916167664670656, "grad_norm": 0.9858783676809323, "learning_rate": 6.529263173935703e-06, "loss": 0.0218, "step": 1470 }, { "epoch": 0.41944682064442546, "grad_norm": 0.9797679454187829, "learning_rate": 6.524865221375681e-06, "loss": 0.0236, "step": 1471 }, { "epoch": 0.4197319646421443, "grad_norm": 0.6172684716843654, "learning_rate": 6.52046596769399e-06, "loss": 0.009, "step": 1472 }, { "epoch": 0.42001710863986313, "grad_norm": 1.7614776588873011, "learning_rate": 6.51606541664438e-06, "loss": 0.0498, "step": 1473 }, { "epoch": 0.420302252637582, "grad_norm": 0.5970946705806073, "learning_rate": 6.511663571981708e-06, "loss": 0.0085, "step": 1474 }, { "epoch": 0.4205873966353008, "grad_norm": 0.8599355447218311, "learning_rate": 6.507260437461939e-06, "loss": 0.0214, "step": 1475 }, { "epoch": 0.42087254063301965, "grad_norm": 1.0987421015708239, "learning_rate": 6.502856016842132e-06, "loss": 0.0185, "step": 1476 }, { "epoch": 0.42115768463073855, "grad_norm": 0.44275052835685647, "learning_rate": 6.498450313880449e-06, "loss": 0.0062, "step": 1477 }, { "epoch": 0.4214428286284574, "grad_norm": 1.813940453890275, "learning_rate": 6.4940433323361425e-06, "loss": 0.0287, "step": 1478 }, { "epoch": 0.42172797262617623, "grad_norm": 1.140392945104892, "learning_rate": 6.489635075969558e-06, "loss": 0.0305, "step": 1479 }, { "epoch": 0.42201311662389507, "grad_norm": 1.8157436853386422, "learning_rate": 6.485225548542129e-06, "loss": 0.0561, "step": 1480 }, { "epoch": 0.4222982606216139, "grad_norm": 2.1226592153844464, "learning_rate": 6.4808147538163715e-06, "loss": 0.0298, "step": 1481 }, { "epoch": 0.42258340461933275, "grad_norm": 0.20001428446574743, "learning_rate": 6.476402695555884e-06, "loss": 0.005, "step": 1482 }, { "epoch": 0.4228685486170516, "grad_norm": 2.5663295718601873, "learning_rate": 6.471989377525344e-06, "loss": 0.0587, "step": 1483 }, { "epoch": 0.4231536926147705, "grad_norm": 1.0554595923300076, "learning_rate": 6.467574803490504e-06, "loss": 0.0356, "step": 1484 }, { "epoch": 0.4234388366124893, "grad_norm": 1.0690511762758468, "learning_rate": 6.463158977218185e-06, "loss": 0.0199, "step": 1485 }, { "epoch": 0.42372398061020816, "grad_norm": 1.1410017307638216, "learning_rate": 6.458741902476281e-06, "loss": 0.0301, "step": 1486 }, { "epoch": 0.424009124607927, "grad_norm": 2.2153038532557616, "learning_rate": 6.454323583033748e-06, "loss": 0.036, "step": 1487 }, { "epoch": 0.42429426860564584, "grad_norm": 1.1095267953641579, "learning_rate": 6.449904022660604e-06, "loss": 0.0212, "step": 1488 }, { "epoch": 0.4245794126033647, "grad_norm": 0.8268000886152754, "learning_rate": 6.445483225127931e-06, "loss": 0.029, "step": 1489 }, { "epoch": 0.4248645566010835, "grad_norm": 0.5393177997173784, "learning_rate": 6.441061194207858e-06, "loss": 0.0102, "step": 1490 }, { "epoch": 0.4251497005988024, "grad_norm": 1.8543252777041668, "learning_rate": 6.436637933673575e-06, "loss": 0.0242, "step": 1491 }, { "epoch": 0.42543484459652126, "grad_norm": 1.63129603581456, "learning_rate": 6.4322134472993145e-06, "loss": 0.0428, "step": 1492 }, { "epoch": 0.4257199885942401, "grad_norm": 1.6335340263445486, "learning_rate": 6.42778773886036e-06, "loss": 0.0353, "step": 1493 }, { "epoch": 0.42600513259195893, "grad_norm": 1.5007279654260655, "learning_rate": 6.423360812133034e-06, "loss": 0.0449, "step": 1494 }, { "epoch": 0.4262902765896778, "grad_norm": 0.34770534993641233, "learning_rate": 6.4189326708946995e-06, "loss": 0.0123, "step": 1495 }, { "epoch": 0.4265754205873966, "grad_norm": 0.5863596956438394, "learning_rate": 6.414503318923757e-06, "loss": 0.0157, "step": 1496 }, { "epoch": 0.4268605645851155, "grad_norm": 0.476093826497124, "learning_rate": 6.410072759999643e-06, "loss": 0.0125, "step": 1497 }, { "epoch": 0.42714570858283435, "grad_norm": 1.7901802911684266, "learning_rate": 6.405640997902813e-06, "loss": 0.0329, "step": 1498 }, { "epoch": 0.4274308525805532, "grad_norm": 0.5017338212552557, "learning_rate": 6.401208036414762e-06, "loss": 0.0056, "step": 1499 }, { "epoch": 0.42771599657827203, "grad_norm": 2.0718273104929588, "learning_rate": 6.396773879318001e-06, "loss": 0.0372, "step": 1500 }, { "epoch": 0.42800114057599087, "grad_norm": 1.55254709417258, "learning_rate": 6.392338530396065e-06, "loss": 0.0098, "step": 1501 }, { "epoch": 0.4282862845737097, "grad_norm": 0.8803879294604957, "learning_rate": 6.387901993433501e-06, "loss": 0.0137, "step": 1502 }, { "epoch": 0.42857142857142855, "grad_norm": 0.9428880424761917, "learning_rate": 6.383464272215874e-06, "loss": 0.0186, "step": 1503 }, { "epoch": 0.42885657256914744, "grad_norm": 1.2850260719004754, "learning_rate": 6.379025370529755e-06, "loss": 0.0149, "step": 1504 }, { "epoch": 0.4291417165668663, "grad_norm": 0.4867048785280101, "learning_rate": 6.374585292162732e-06, "loss": 0.0168, "step": 1505 }, { "epoch": 0.4294268605645851, "grad_norm": 0.6247035768828255, "learning_rate": 6.370144040903385e-06, "loss": 0.0145, "step": 1506 }, { "epoch": 0.42971200456230396, "grad_norm": 0.3953757300162384, "learning_rate": 6.365701620541304e-06, "loss": 0.0056, "step": 1507 }, { "epoch": 0.4299971485600228, "grad_norm": 0.6458629274018028, "learning_rate": 6.361258034867071e-06, "loss": 0.0204, "step": 1508 }, { "epoch": 0.43028229255774164, "grad_norm": 1.7431227583997453, "learning_rate": 6.356813287672262e-06, "loss": 0.0472, "step": 1509 }, { "epoch": 0.4305674365554605, "grad_norm": 1.3899155935216172, "learning_rate": 6.352367382749448e-06, "loss": 0.0242, "step": 1510 }, { "epoch": 0.4308525805531794, "grad_norm": 1.2294226877678534, "learning_rate": 6.347920323892189e-06, "loss": 0.0272, "step": 1511 }, { "epoch": 0.4311377245508982, "grad_norm": 1.1560872748885018, "learning_rate": 6.343472114895022e-06, "loss": 0.0314, "step": 1512 }, { "epoch": 0.43142286854861706, "grad_norm": 0.2978850832967228, "learning_rate": 6.339022759553474e-06, "loss": 0.0098, "step": 1513 }, { "epoch": 0.4317080125463359, "grad_norm": 1.1481971058772948, "learning_rate": 6.334572261664041e-06, "loss": 0.0289, "step": 1514 }, { "epoch": 0.43199315654405473, "grad_norm": 2.2882732185225914, "learning_rate": 6.330120625024204e-06, "loss": 0.0508, "step": 1515 }, { "epoch": 0.4322783005417736, "grad_norm": 0.80064981962937, "learning_rate": 6.32566785343241e-06, "loss": 0.0124, "step": 1516 }, { "epoch": 0.43256344453949247, "grad_norm": 1.5961426275986392, "learning_rate": 6.321213950688073e-06, "loss": 0.0304, "step": 1517 }, { "epoch": 0.4328485885372113, "grad_norm": 1.2598701107379529, "learning_rate": 6.316758920591576e-06, "loss": 0.0249, "step": 1518 }, { "epoch": 0.43313373253493015, "grad_norm": 1.8114852964203785, "learning_rate": 6.312302766944263e-06, "loss": 0.0433, "step": 1519 }, { "epoch": 0.433418876532649, "grad_norm": 2.1634382274709756, "learning_rate": 6.307845493548433e-06, "loss": 0.0384, "step": 1520 }, { "epoch": 0.43370402053036783, "grad_norm": 1.6679111444495003, "learning_rate": 6.303387104207347e-06, "loss": 0.0361, "step": 1521 }, { "epoch": 0.43398916452808667, "grad_norm": 1.0012504241438478, "learning_rate": 6.2989276027252134e-06, "loss": 0.0218, "step": 1522 }, { "epoch": 0.4342743085258055, "grad_norm": 1.2371276572473315, "learning_rate": 6.2944669929071885e-06, "loss": 0.0268, "step": 1523 }, { "epoch": 0.4345594525235244, "grad_norm": 0.5710678689888669, "learning_rate": 6.29000527855938e-06, "loss": 0.0115, "step": 1524 }, { "epoch": 0.43484459652124324, "grad_norm": 1.0062812882252696, "learning_rate": 6.285542463488834e-06, "loss": 0.0141, "step": 1525 }, { "epoch": 0.4351297405189621, "grad_norm": 1.2851888569327077, "learning_rate": 6.2810785515035345e-06, "loss": 0.027, "step": 1526 }, { "epoch": 0.4354148845166809, "grad_norm": 1.4671361717954792, "learning_rate": 6.276613546412406e-06, "loss": 0.0416, "step": 1527 }, { "epoch": 0.43570002851439976, "grad_norm": 1.224206846786883, "learning_rate": 6.2721474520253e-06, "loss": 0.0195, "step": 1528 }, { "epoch": 0.4359851725121186, "grad_norm": 1.9344425177803088, "learning_rate": 6.2676802721530035e-06, "loss": 0.0291, "step": 1529 }, { "epoch": 0.43627031650983744, "grad_norm": 0.34256835211154624, "learning_rate": 6.263212010607226e-06, "loss": 0.0074, "step": 1530 }, { "epoch": 0.43655546050755634, "grad_norm": 1.4291677082822674, "learning_rate": 6.2587426712006005e-06, "loss": 0.0313, "step": 1531 }, { "epoch": 0.4368406045052752, "grad_norm": 1.1108722988310464, "learning_rate": 6.254272257746678e-06, "loss": 0.0235, "step": 1532 }, { "epoch": 0.437125748502994, "grad_norm": 0.876472008383419, "learning_rate": 6.249800774059934e-06, "loss": 0.0121, "step": 1533 }, { "epoch": 0.43741089250071286, "grad_norm": 1.5645424818809175, "learning_rate": 6.245328223955744e-06, "loss": 0.0377, "step": 1534 }, { "epoch": 0.4376960364984317, "grad_norm": 2.1968349193676375, "learning_rate": 6.240854611250406e-06, "loss": 0.0447, "step": 1535 }, { "epoch": 0.43798118049615054, "grad_norm": 1.6968143052217195, "learning_rate": 6.236379939761117e-06, "loss": 0.0398, "step": 1536 }, { "epoch": 0.43826632449386943, "grad_norm": 1.7686078679002104, "learning_rate": 6.231904213305979e-06, "loss": 0.0277, "step": 1537 }, { "epoch": 0.43855146849158827, "grad_norm": 2.1556753958526764, "learning_rate": 6.227427435703997e-06, "loss": 0.0453, "step": 1538 }, { "epoch": 0.4388366124893071, "grad_norm": 1.5287737335788831, "learning_rate": 6.2229496107750685e-06, "loss": 0.0547, "step": 1539 }, { "epoch": 0.43912175648702595, "grad_norm": 1.9309123246484545, "learning_rate": 6.218470742339987e-06, "loss": 0.0245, "step": 1540 }, { "epoch": 0.4394069004847448, "grad_norm": 1.1195972239606753, "learning_rate": 6.21399083422044e-06, "loss": 0.0373, "step": 1541 }, { "epoch": 0.43969204448246363, "grad_norm": 0.8200635359851965, "learning_rate": 6.2095098902389926e-06, "loss": 0.0108, "step": 1542 }, { "epoch": 0.43997718848018247, "grad_norm": 1.633335602846522, "learning_rate": 6.205027914219105e-06, "loss": 0.0248, "step": 1543 }, { "epoch": 0.44026233247790136, "grad_norm": 0.9579796630214975, "learning_rate": 6.20054490998511e-06, "loss": 0.0135, "step": 1544 }, { "epoch": 0.4405474764756202, "grad_norm": 0.9207521721967611, "learning_rate": 6.1960608813622215e-06, "loss": 0.0276, "step": 1545 }, { "epoch": 0.44083262047333904, "grad_norm": 1.5832680552524554, "learning_rate": 6.191575832176524e-06, "loss": 0.0256, "step": 1546 }, { "epoch": 0.4411177644710579, "grad_norm": 1.3157435085679845, "learning_rate": 6.187089766254979e-06, "loss": 0.0224, "step": 1547 }, { "epoch": 0.4414029084687767, "grad_norm": 1.026787695779842, "learning_rate": 6.182602687425407e-06, "loss": 0.0372, "step": 1548 }, { "epoch": 0.44168805246649556, "grad_norm": 0.948727292811553, "learning_rate": 6.178114599516504e-06, "loss": 0.0287, "step": 1549 }, { "epoch": 0.4419731964642144, "grad_norm": 0.6305066109325781, "learning_rate": 6.173625506357814e-06, "loss": 0.0223, "step": 1550 }, { "epoch": 0.4422583404619333, "grad_norm": 0.7164994011017372, "learning_rate": 6.169135411779749e-06, "loss": 0.0119, "step": 1551 }, { "epoch": 0.44254348445965214, "grad_norm": 0.8216759106611643, "learning_rate": 6.164644319613571e-06, "loss": 0.0271, "step": 1552 }, { "epoch": 0.442828628457371, "grad_norm": 0.8327268854217976, "learning_rate": 6.160152233691393e-06, "loss": 0.022, "step": 1553 }, { "epoch": 0.4431137724550898, "grad_norm": 1.1286241666524117, "learning_rate": 6.155659157846178e-06, "loss": 0.0303, "step": 1554 }, { "epoch": 0.44339891645280866, "grad_norm": 1.5466332840928882, "learning_rate": 6.151165095911733e-06, "loss": 0.0355, "step": 1555 }, { "epoch": 0.4436840604505275, "grad_norm": 1.164291430707095, "learning_rate": 6.1466700517227044e-06, "loss": 0.0292, "step": 1556 }, { "epoch": 0.4439692044482464, "grad_norm": 0.7237425608379466, "learning_rate": 6.142174029114579e-06, "loss": 0.0204, "step": 1557 }, { "epoch": 0.44425434844596523, "grad_norm": 1.525607471024849, "learning_rate": 6.137677031923679e-06, "loss": 0.0319, "step": 1558 }, { "epoch": 0.44453949244368407, "grad_norm": 1.5076148523093191, "learning_rate": 6.133179063987156e-06, "loss": 0.0336, "step": 1559 }, { "epoch": 0.4448246364414029, "grad_norm": 2.470324848338266, "learning_rate": 6.128680129142991e-06, "loss": 0.0173, "step": 1560 }, { "epoch": 0.44510978043912175, "grad_norm": 3.040160807782214, "learning_rate": 6.1241802312299895e-06, "loss": 0.0709, "step": 1561 }, { "epoch": 0.4453949244368406, "grad_norm": 1.0375626585244984, "learning_rate": 6.119679374087778e-06, "loss": 0.027, "step": 1562 }, { "epoch": 0.44568006843455943, "grad_norm": 1.3814735291614966, "learning_rate": 6.115177561556806e-06, "loss": 0.0252, "step": 1563 }, { "epoch": 0.4459652124322783, "grad_norm": 1.2628722116063182, "learning_rate": 6.110674797478332e-06, "loss": 0.0215, "step": 1564 }, { "epoch": 0.44625035642999716, "grad_norm": 1.0677934713873056, "learning_rate": 6.10617108569443e-06, "loss": 0.0293, "step": 1565 }, { "epoch": 0.446535500427716, "grad_norm": 1.3607904858833422, "learning_rate": 6.1016664300479835e-06, "loss": 0.0382, "step": 1566 }, { "epoch": 0.44682064442543484, "grad_norm": 2.4973158350192355, "learning_rate": 6.097160834382678e-06, "loss": 0.0577, "step": 1567 }, { "epoch": 0.4471057884231537, "grad_norm": 1.4543142327687228, "learning_rate": 6.092654302543002e-06, "loss": 0.0569, "step": 1568 }, { "epoch": 0.4473909324208725, "grad_norm": 1.547257628130523, "learning_rate": 6.088146838374247e-06, "loss": 0.0202, "step": 1569 }, { "epoch": 0.44767607641859136, "grad_norm": 1.1158703535313215, "learning_rate": 6.083638445722493e-06, "loss": 0.0192, "step": 1570 }, { "epoch": 0.44796122041631026, "grad_norm": 0.9709682394236485, "learning_rate": 6.079129128434619e-06, "loss": 0.031, "step": 1571 }, { "epoch": 0.4482463644140291, "grad_norm": 1.6318763828200638, "learning_rate": 6.074618890358287e-06, "loss": 0.0184, "step": 1572 }, { "epoch": 0.44853150841174794, "grad_norm": 1.8459498966727417, "learning_rate": 6.07010773534195e-06, "loss": 0.039, "step": 1573 }, { "epoch": 0.4488166524094668, "grad_norm": 1.2404446802074922, "learning_rate": 6.06559566723484e-06, "loss": 0.0182, "step": 1574 }, { "epoch": 0.4491017964071856, "grad_norm": 1.4442916601321651, "learning_rate": 6.06108268988697e-06, "loss": 0.0401, "step": 1575 }, { "epoch": 0.44938694040490446, "grad_norm": 0.9320753790810795, "learning_rate": 6.056568807149127e-06, "loss": 0.0239, "step": 1576 }, { "epoch": 0.44967208440262335, "grad_norm": 1.4017667599463124, "learning_rate": 6.052054022872873e-06, "loss": 0.023, "step": 1577 }, { "epoch": 0.4499572284003422, "grad_norm": 0.6833624466019657, "learning_rate": 6.047538340910534e-06, "loss": 0.0144, "step": 1578 }, { "epoch": 0.45024237239806103, "grad_norm": 0.6630610636763746, "learning_rate": 6.0430217651152105e-06, "loss": 0.02, "step": 1579 }, { "epoch": 0.45052751639577987, "grad_norm": 0.6627483075350021, "learning_rate": 6.0385042993407574e-06, "loss": 0.0152, "step": 1580 }, { "epoch": 0.4508126603934987, "grad_norm": 0.7801825576978031, "learning_rate": 6.033985947441795e-06, "loss": 0.0186, "step": 1581 }, { "epoch": 0.45109780439121755, "grad_norm": 0.7431161777834322, "learning_rate": 6.029466713273695e-06, "loss": 0.0115, "step": 1582 }, { "epoch": 0.4513829483889364, "grad_norm": 0.4049070036061823, "learning_rate": 6.024946600692585e-06, "loss": 0.0074, "step": 1583 }, { "epoch": 0.4516680923866553, "grad_norm": 1.0478163608067583, "learning_rate": 6.020425613555341e-06, "loss": 0.0199, "step": 1584 }, { "epoch": 0.4519532363843741, "grad_norm": 1.5885560292025787, "learning_rate": 6.015903755719588e-06, "loss": 0.0454, "step": 1585 }, { "epoch": 0.45223838038209296, "grad_norm": 1.0854421779536836, "learning_rate": 6.011381031043686e-06, "loss": 0.0294, "step": 1586 }, { "epoch": 0.4525235243798118, "grad_norm": 0.4134891652172671, "learning_rate": 6.006857443386746e-06, "loss": 0.012, "step": 1587 }, { "epoch": 0.45280866837753064, "grad_norm": 1.7921075489007599, "learning_rate": 6.002332996608605e-06, "loss": 0.0277, "step": 1588 }, { "epoch": 0.4530938123752495, "grad_norm": 0.9673035410671176, "learning_rate": 5.9978076945698395e-06, "loss": 0.017, "step": 1589 }, { "epoch": 0.4533789563729683, "grad_norm": 0.9724566217783264, "learning_rate": 5.993281541131753e-06, "loss": 0.0319, "step": 1590 }, { "epoch": 0.4536641003706872, "grad_norm": 0.7175181508761327, "learning_rate": 5.9887545401563775e-06, "loss": 0.017, "step": 1591 }, { "epoch": 0.45394924436840606, "grad_norm": 1.1816527100772103, "learning_rate": 5.984226695506464e-06, "loss": 0.0279, "step": 1592 }, { "epoch": 0.4542343883661249, "grad_norm": 1.9390267938168535, "learning_rate": 5.979698011045492e-06, "loss": 0.038, "step": 1593 }, { "epoch": 0.45451953236384374, "grad_norm": 0.9155009626135624, "learning_rate": 5.975168490637644e-06, "loss": 0.052, "step": 1594 }, { "epoch": 0.4548046763615626, "grad_norm": 0.7977284818854188, "learning_rate": 5.970638138147829e-06, "loss": 0.0107, "step": 1595 }, { "epoch": 0.4550898203592814, "grad_norm": 0.9196711480479474, "learning_rate": 5.966106957441661e-06, "loss": 0.0176, "step": 1596 }, { "epoch": 0.4553749643570003, "grad_norm": 4.36651610023969, "learning_rate": 5.961574952385457e-06, "loss": 0.0302, "step": 1597 }, { "epoch": 0.45566010835471915, "grad_norm": 2.9956145689115043, "learning_rate": 5.957042126846243e-06, "loss": 0.0496, "step": 1598 }, { "epoch": 0.455945252352438, "grad_norm": 1.318241867584188, "learning_rate": 5.952508484691742e-06, "loss": 0.0248, "step": 1599 }, { "epoch": 0.45623039635015683, "grad_norm": 1.4632248370825405, "learning_rate": 5.947974029790375e-06, "loss": 0.0288, "step": 1600 }, { "epoch": 0.45651554034787567, "grad_norm": 1.5614361952777818, "learning_rate": 5.943438766011256e-06, "loss": 0.0505, "step": 1601 }, { "epoch": 0.4568006843455945, "grad_norm": 0.5404823438930301, "learning_rate": 5.938902697224189e-06, "loss": 0.0065, "step": 1602 }, { "epoch": 0.45708582834331335, "grad_norm": 1.0138237952237483, "learning_rate": 5.934365827299666e-06, "loss": 0.0248, "step": 1603 }, { "epoch": 0.45737097234103224, "grad_norm": 1.3912521886461697, "learning_rate": 5.9298281601088616e-06, "loss": 0.0226, "step": 1604 }, { "epoch": 0.4576561163387511, "grad_norm": 1.346109081340996, "learning_rate": 5.92528969952363e-06, "loss": 0.0431, "step": 1605 }, { "epoch": 0.4579412603364699, "grad_norm": 0.6722377723333736, "learning_rate": 5.9207504494165025e-06, "loss": 0.0057, "step": 1606 }, { "epoch": 0.45822640433418876, "grad_norm": 1.5756505347726604, "learning_rate": 5.916210413660687e-06, "loss": 0.0323, "step": 1607 }, { "epoch": 0.4585115483319076, "grad_norm": 1.372238468851667, "learning_rate": 5.9116695961300584e-06, "loss": 0.0216, "step": 1608 }, { "epoch": 0.45879669232962644, "grad_norm": 0.7560326833267913, "learning_rate": 5.907128000699159e-06, "loss": 0.0141, "step": 1609 }, { "epoch": 0.4590818363273453, "grad_norm": 1.8272989751745645, "learning_rate": 5.9025856312431985e-06, "loss": 0.031, "step": 1610 }, { "epoch": 0.4593669803250642, "grad_norm": 1.515596989644, "learning_rate": 5.898042491638042e-06, "loss": 0.0302, "step": 1611 }, { "epoch": 0.459652124322783, "grad_norm": 1.3089273048621382, "learning_rate": 5.8934985857602144e-06, "loss": 0.0314, "step": 1612 }, { "epoch": 0.45993726832050186, "grad_norm": 1.0921868001336128, "learning_rate": 5.8889539174868935e-06, "loss": 0.0199, "step": 1613 }, { "epoch": 0.4602224123182207, "grad_norm": 0.9944417014298922, "learning_rate": 5.88440849069591e-06, "loss": 0.0139, "step": 1614 }, { "epoch": 0.46050755631593954, "grad_norm": 0.9965377028835146, "learning_rate": 5.879862309265738e-06, "loss": 0.0235, "step": 1615 }, { "epoch": 0.4607927003136584, "grad_norm": 1.4517072630287036, "learning_rate": 5.875315377075497e-06, "loss": 0.0417, "step": 1616 }, { "epoch": 0.46107784431137727, "grad_norm": 0.4827625546781854, "learning_rate": 5.87076769800495e-06, "loss": 0.0184, "step": 1617 }, { "epoch": 0.4613629883090961, "grad_norm": 1.9595291045089964, "learning_rate": 5.866219275934494e-06, "loss": 0.0388, "step": 1618 }, { "epoch": 0.46164813230681495, "grad_norm": 0.9604209914453493, "learning_rate": 5.86167011474516e-06, "loss": 0.0438, "step": 1619 }, { "epoch": 0.4619332763045338, "grad_norm": 1.5066448543730424, "learning_rate": 5.857120218318612e-06, "loss": 0.0418, "step": 1620 }, { "epoch": 0.46221842030225263, "grad_norm": 1.5026872619543066, "learning_rate": 5.852569590537138e-06, "loss": 0.0284, "step": 1621 }, { "epoch": 0.46250356429997147, "grad_norm": 1.0384555882506465, "learning_rate": 5.848018235283654e-06, "loss": 0.0217, "step": 1622 }, { "epoch": 0.4627887082976903, "grad_norm": 1.2663294731314405, "learning_rate": 5.843466156441693e-06, "loss": 0.0215, "step": 1623 }, { "epoch": 0.4630738522954092, "grad_norm": 1.4544791021792642, "learning_rate": 5.838913357895408e-06, "loss": 0.043, "step": 1624 }, { "epoch": 0.46335899629312804, "grad_norm": 1.9146030561749325, "learning_rate": 5.834359843529565e-06, "loss": 0.0429, "step": 1625 }, { "epoch": 0.4636441402908469, "grad_norm": 0.5614062355847471, "learning_rate": 5.82980561722954e-06, "loss": 0.0084, "step": 1626 }, { "epoch": 0.4639292842885657, "grad_norm": 1.0668880452470082, "learning_rate": 5.8252506828813195e-06, "loss": 0.0259, "step": 1627 }, { "epoch": 0.46421442828628456, "grad_norm": 0.8362296876846605, "learning_rate": 5.82069504437149e-06, "loss": 0.0219, "step": 1628 }, { "epoch": 0.4644995722840034, "grad_norm": 1.865874275154151, "learning_rate": 5.816138705587242e-06, "loss": 0.0339, "step": 1629 }, { "epoch": 0.46478471628172224, "grad_norm": 0.7831355484928149, "learning_rate": 5.811581670416363e-06, "loss": 0.011, "step": 1630 }, { "epoch": 0.46506986027944114, "grad_norm": 0.9660875848485134, "learning_rate": 5.807023942747232e-06, "loss": 0.0145, "step": 1631 }, { "epoch": 0.46535500427716, "grad_norm": 0.6626505988518036, "learning_rate": 5.802465526468825e-06, "loss": 0.0171, "step": 1632 }, { "epoch": 0.4656401482748788, "grad_norm": 1.8732921332420605, "learning_rate": 5.797906425470696e-06, "loss": 0.0412, "step": 1633 }, { "epoch": 0.46592529227259766, "grad_norm": 0.9110735893383872, "learning_rate": 5.7933466436429945e-06, "loss": 0.0196, "step": 1634 }, { "epoch": 0.4662104362703165, "grad_norm": 0.9187090445005635, "learning_rate": 5.788786184876443e-06, "loss": 0.0086, "step": 1635 }, { "epoch": 0.46649558026803534, "grad_norm": 0.4577963271928218, "learning_rate": 5.784225053062342e-06, "loss": 0.0172, "step": 1636 }, { "epoch": 0.46678072426575423, "grad_norm": 0.6514140579973696, "learning_rate": 5.77966325209257e-06, "loss": 0.0185, "step": 1637 }, { "epoch": 0.46706586826347307, "grad_norm": 1.5558388744790788, "learning_rate": 5.775100785859576e-06, "loss": 0.0379, "step": 1638 }, { "epoch": 0.4673510122611919, "grad_norm": 1.3399688016409907, "learning_rate": 5.770537658256373e-06, "loss": 0.0327, "step": 1639 }, { "epoch": 0.46763615625891075, "grad_norm": 1.1869080276664048, "learning_rate": 5.765973873176544e-06, "loss": 0.0194, "step": 1640 }, { "epoch": 0.4679213002566296, "grad_norm": 1.2920295998953273, "learning_rate": 5.761409434514223e-06, "loss": 0.0158, "step": 1641 }, { "epoch": 0.46820644425434843, "grad_norm": 0.5899684738930414, "learning_rate": 5.756844346164115e-06, "loss": 0.0119, "step": 1642 }, { "epoch": 0.46849158825206727, "grad_norm": 0.7664070156217964, "learning_rate": 5.752278612021471e-06, "loss": 0.0178, "step": 1643 }, { "epoch": 0.46877673224978617, "grad_norm": 1.0535022309389133, "learning_rate": 5.747712235982094e-06, "loss": 0.0217, "step": 1644 }, { "epoch": 0.469061876247505, "grad_norm": 0.5765641658694458, "learning_rate": 5.743145221942333e-06, "loss": 0.0088, "step": 1645 }, { "epoch": 0.46934702024522384, "grad_norm": 1.220409106870033, "learning_rate": 5.73857757379909e-06, "loss": 0.0312, "step": 1646 }, { "epoch": 0.4696321642429427, "grad_norm": 1.1369091506463587, "learning_rate": 5.7340092954497965e-06, "loss": 0.0241, "step": 1647 }, { "epoch": 0.4699173082406615, "grad_norm": 1.0456981783807946, "learning_rate": 5.729440390792429e-06, "loss": 0.0241, "step": 1648 }, { "epoch": 0.47020245223838036, "grad_norm": 2.3070988825991146, "learning_rate": 5.724870863725497e-06, "loss": 0.0403, "step": 1649 }, { "epoch": 0.4704875962360992, "grad_norm": 1.3376794149494875, "learning_rate": 5.72030071814804e-06, "loss": 0.0313, "step": 1650 }, { "epoch": 0.4707727402338181, "grad_norm": 0.9046821280854266, "learning_rate": 5.715729957959625e-06, "loss": 0.0101, "step": 1651 }, { "epoch": 0.47105788423153694, "grad_norm": 1.0675955765788645, "learning_rate": 5.7111585870603455e-06, "loss": 0.0191, "step": 1652 }, { "epoch": 0.4713430282292558, "grad_norm": 0.5649793938230241, "learning_rate": 5.7065866093508145e-06, "loss": 0.0112, "step": 1653 }, { "epoch": 0.4716281722269746, "grad_norm": 1.3714092185991984, "learning_rate": 5.7020140287321645e-06, "loss": 0.0291, "step": 1654 }, { "epoch": 0.47191331622469346, "grad_norm": 0.5776238056475193, "learning_rate": 5.697440849106039e-06, "loss": 0.0129, "step": 1655 }, { "epoch": 0.4721984602224123, "grad_norm": 0.9710900939411276, "learning_rate": 5.692867074374596e-06, "loss": 0.0429, "step": 1656 }, { "epoch": 0.4724836042201312, "grad_norm": 0.6235588357191956, "learning_rate": 5.688292708440501e-06, "loss": 0.0092, "step": 1657 }, { "epoch": 0.47276874821785003, "grad_norm": 1.3140976352998177, "learning_rate": 5.6837177552069204e-06, "loss": 0.0238, "step": 1658 }, { "epoch": 0.47305389221556887, "grad_norm": 1.4703932405577713, "learning_rate": 5.679142218577525e-06, "loss": 0.0233, "step": 1659 }, { "epoch": 0.4733390362132877, "grad_norm": 3.059192393508655, "learning_rate": 5.6745661024564854e-06, "loss": 0.0941, "step": 1660 }, { "epoch": 0.47362418021100655, "grad_norm": 1.2480858491288185, "learning_rate": 5.66998941074846e-06, "loss": 0.0254, "step": 1661 }, { "epoch": 0.4739093242087254, "grad_norm": 1.3391566425560755, "learning_rate": 5.665412147358607e-06, "loss": 0.0192, "step": 1662 }, { "epoch": 0.47419446820644423, "grad_norm": 1.9530592193510963, "learning_rate": 5.6608343161925616e-06, "loss": 0.0348, "step": 1663 }, { "epoch": 0.4744796122041631, "grad_norm": 0.5882117331069833, "learning_rate": 5.656255921156455e-06, "loss": 0.0078, "step": 1664 }, { "epoch": 0.47476475620188197, "grad_norm": 0.4806565752566173, "learning_rate": 5.651676966156892e-06, "loss": 0.0166, "step": 1665 }, { "epoch": 0.4750499001996008, "grad_norm": 2.5545952290218805, "learning_rate": 5.6470974551009565e-06, "loss": 0.0401, "step": 1666 }, { "epoch": 0.47533504419731964, "grad_norm": 3.454081035186067, "learning_rate": 5.642517391896208e-06, "loss": 0.0461, "step": 1667 }, { "epoch": 0.4756201881950385, "grad_norm": 1.2109065735867222, "learning_rate": 5.637936780450679e-06, "loss": 0.0287, "step": 1668 }, { "epoch": 0.4759053321927573, "grad_norm": 1.1002493916885174, "learning_rate": 5.633355624672865e-06, "loss": 0.0123, "step": 1669 }, { "epoch": 0.47619047619047616, "grad_norm": 0.7138212776082841, "learning_rate": 5.628773928471729e-06, "loss": 0.008, "step": 1670 }, { "epoch": 0.47647562018819506, "grad_norm": 1.273915884803688, "learning_rate": 5.6241916957566955e-06, "loss": 0.014, "step": 1671 }, { "epoch": 0.4767607641859139, "grad_norm": 1.0112889834715977, "learning_rate": 5.619608930437647e-06, "loss": 0.0296, "step": 1672 }, { "epoch": 0.47704590818363274, "grad_norm": 2.625237325555971, "learning_rate": 5.6150256364249165e-06, "loss": 0.0315, "step": 1673 }, { "epoch": 0.4773310521813516, "grad_norm": 0.9222000915216457, "learning_rate": 5.610441817629293e-06, "loss": 0.0214, "step": 1674 }, { "epoch": 0.4776161961790704, "grad_norm": 1.2738057180224518, "learning_rate": 5.605857477962009e-06, "loss": 0.06, "step": 1675 }, { "epoch": 0.47790134017678926, "grad_norm": 1.7750846497598578, "learning_rate": 5.601272621334749e-06, "loss": 0.0754, "step": 1676 }, { "epoch": 0.47818648417450815, "grad_norm": 1.7959815648598039, "learning_rate": 5.5966872516596275e-06, "loss": 0.0617, "step": 1677 }, { "epoch": 0.478471628172227, "grad_norm": 1.1371874135314433, "learning_rate": 5.592101372849207e-06, "loss": 0.0196, "step": 1678 }, { "epoch": 0.47875677216994583, "grad_norm": 0.9732582128190234, "learning_rate": 5.587514988816477e-06, "loss": 0.0151, "step": 1679 }, { "epoch": 0.47904191616766467, "grad_norm": 1.3674954779188182, "learning_rate": 5.582928103474863e-06, "loss": 0.0298, "step": 1680 }, { "epoch": 0.4793270601653835, "grad_norm": 0.9010991025266389, "learning_rate": 5.5783407207382146e-06, "loss": 0.0208, "step": 1681 }, { "epoch": 0.47961220416310235, "grad_norm": 0.7798121219399466, "learning_rate": 5.573752844520812e-06, "loss": 0.0186, "step": 1682 }, { "epoch": 0.4798973481608212, "grad_norm": 0.8532491665099611, "learning_rate": 5.569164478737346e-06, "loss": 0.0285, "step": 1683 }, { "epoch": 0.4801824921585401, "grad_norm": 1.2652300108229633, "learning_rate": 5.564575627302936e-06, "loss": 0.0146, "step": 1684 }, { "epoch": 0.4804676361562589, "grad_norm": 1.0842547917810184, "learning_rate": 5.5599862941331084e-06, "loss": 0.0236, "step": 1685 }, { "epoch": 0.48075278015397777, "grad_norm": 0.7692465444203684, "learning_rate": 5.5553964831438044e-06, "loss": 0.0098, "step": 1686 }, { "epoch": 0.4810379241516966, "grad_norm": 0.6516988532185854, "learning_rate": 5.5508061982513715e-06, "loss": 0.0095, "step": 1687 }, { "epoch": 0.48132306814941545, "grad_norm": 0.86894709268537, "learning_rate": 5.546215443372562e-06, "loss": 0.0207, "step": 1688 }, { "epoch": 0.4816082121471343, "grad_norm": 1.1589813428803937, "learning_rate": 5.541624222424529e-06, "loss": 0.0182, "step": 1689 }, { "epoch": 0.4818933561448531, "grad_norm": 0.4753465975864277, "learning_rate": 5.537032539324824e-06, "loss": 0.0143, "step": 1690 }, { "epoch": 0.482178500142572, "grad_norm": 0.3429031884155126, "learning_rate": 5.53244039799139e-06, "loss": 0.0089, "step": 1691 }, { "epoch": 0.48246364414029086, "grad_norm": 1.9674672537859532, "learning_rate": 5.527847802342567e-06, "loss": 0.054, "step": 1692 }, { "epoch": 0.4827487881380097, "grad_norm": 0.9183324853750284, "learning_rate": 5.523254756297074e-06, "loss": 0.0332, "step": 1693 }, { "epoch": 0.48303393213572854, "grad_norm": 1.5405624693680082, "learning_rate": 5.518661263774023e-06, "loss": 0.0625, "step": 1694 }, { "epoch": 0.4833190761334474, "grad_norm": 1.4478394054980839, "learning_rate": 5.514067328692902e-06, "loss": 0.0234, "step": 1695 }, { "epoch": 0.4836042201311662, "grad_norm": 1.198470695313575, "learning_rate": 5.509472954973578e-06, "loss": 0.0339, "step": 1696 }, { "epoch": 0.4838893641288851, "grad_norm": 1.4872008148949296, "learning_rate": 5.504878146536291e-06, "loss": 0.0152, "step": 1697 }, { "epoch": 0.48417450812660395, "grad_norm": 0.9089989321086841, "learning_rate": 5.500282907301655e-06, "loss": 0.0227, "step": 1698 }, { "epoch": 0.4844596521243228, "grad_norm": 0.9108428863142347, "learning_rate": 5.495687241190646e-06, "loss": 0.0081, "step": 1699 }, { "epoch": 0.48474479612204163, "grad_norm": 1.7700228667742393, "learning_rate": 5.491091152124611e-06, "loss": 0.0397, "step": 1700 }, { "epoch": 0.48502994011976047, "grad_norm": 1.086368323032613, "learning_rate": 5.4864946440252555e-06, "loss": 0.0295, "step": 1701 }, { "epoch": 0.4853150841174793, "grad_norm": 0.8881192538969089, "learning_rate": 5.4818977208146375e-06, "loss": 0.0315, "step": 1702 }, { "epoch": 0.48560022811519815, "grad_norm": 0.8299716634196358, "learning_rate": 5.477300386415176e-06, "loss": 0.0125, "step": 1703 }, { "epoch": 0.48588537211291705, "grad_norm": 0.8778183424306557, "learning_rate": 5.47270264474964e-06, "loss": 0.0225, "step": 1704 }, { "epoch": 0.4861705161106359, "grad_norm": 0.3759935217970214, "learning_rate": 5.46810449974114e-06, "loss": 0.0067, "step": 1705 }, { "epoch": 0.4864556601083547, "grad_norm": 1.012504942599993, "learning_rate": 5.46350595531314e-06, "loss": 0.0342, "step": 1706 }, { "epoch": 0.48674080410607357, "grad_norm": 0.3155753567046455, "learning_rate": 5.458907015389435e-06, "loss": 0.0068, "step": 1707 }, { "epoch": 0.4870259481037924, "grad_norm": 0.8772082266297694, "learning_rate": 5.454307683894166e-06, "loss": 0.0137, "step": 1708 }, { "epoch": 0.48731109210151125, "grad_norm": 0.8540371209025869, "learning_rate": 5.449707964751801e-06, "loss": 0.011, "step": 1709 }, { "epoch": 0.4875962360992301, "grad_norm": 0.9996853289237507, "learning_rate": 5.445107861887144e-06, "loss": 0.0156, "step": 1710 }, { "epoch": 0.487881380096949, "grad_norm": 1.4433308389528134, "learning_rate": 5.440507379225325e-06, "loss": 0.0347, "step": 1711 }, { "epoch": 0.4881665240946678, "grad_norm": 0.6514104791176355, "learning_rate": 5.435906520691798e-06, "loss": 0.0279, "step": 1712 }, { "epoch": 0.48845166809238666, "grad_norm": 1.1622561862286143, "learning_rate": 5.431305290212335e-06, "loss": 0.0531, "step": 1713 }, { "epoch": 0.4887368120901055, "grad_norm": 1.0205243505665194, "learning_rate": 5.426703691713029e-06, "loss": 0.034, "step": 1714 }, { "epoch": 0.48902195608782434, "grad_norm": 1.5178878114736611, "learning_rate": 5.422101729120287e-06, "loss": 0.0326, "step": 1715 }, { "epoch": 0.4893071000855432, "grad_norm": 1.5210315919144914, "learning_rate": 5.417499406360823e-06, "loss": 0.0244, "step": 1716 }, { "epoch": 0.4895922440832621, "grad_norm": 0.7365109722429185, "learning_rate": 5.412896727361663e-06, "loss": 0.0109, "step": 1717 }, { "epoch": 0.4898773880809809, "grad_norm": 1.5587515861114862, "learning_rate": 5.408293696050132e-06, "loss": 0.0252, "step": 1718 }, { "epoch": 0.49016253207869975, "grad_norm": 0.8960330096591171, "learning_rate": 5.40369031635386e-06, "loss": 0.0364, "step": 1719 }, { "epoch": 0.4904476760764186, "grad_norm": 0.6398440868605254, "learning_rate": 5.399086592200774e-06, "loss": 0.0134, "step": 1720 }, { "epoch": 0.49073282007413743, "grad_norm": 1.2962219418993994, "learning_rate": 5.394482527519089e-06, "loss": 0.0417, "step": 1721 }, { "epoch": 0.49101796407185627, "grad_norm": 1.1342913204571548, "learning_rate": 5.38987812623732e-06, "loss": 0.0349, "step": 1722 }, { "epoch": 0.4913031080695751, "grad_norm": 1.415562479211778, "learning_rate": 5.385273392284262e-06, "loss": 0.0341, "step": 1723 }, { "epoch": 0.491588252067294, "grad_norm": 0.570721980159015, "learning_rate": 5.380668329588996e-06, "loss": 0.0092, "step": 1724 }, { "epoch": 0.49187339606501285, "grad_norm": 1.2011080438952686, "learning_rate": 5.376062942080883e-06, "loss": 0.0199, "step": 1725 }, { "epoch": 0.4921585400627317, "grad_norm": 0.8641679648671838, "learning_rate": 5.371457233689563e-06, "loss": 0.0344, "step": 1726 }, { "epoch": 0.4924436840604505, "grad_norm": 1.3972786914451918, "learning_rate": 5.366851208344948e-06, "loss": 0.0267, "step": 1727 }, { "epoch": 0.49272882805816937, "grad_norm": 0.9723899245443296, "learning_rate": 5.362244869977224e-06, "loss": 0.0241, "step": 1728 }, { "epoch": 0.4930139720558882, "grad_norm": 1.2779418929535273, "learning_rate": 5.357638222516833e-06, "loss": 0.0228, "step": 1729 }, { "epoch": 0.49329911605360705, "grad_norm": 1.596318162921975, "learning_rate": 5.353031269894497e-06, "loss": 0.0324, "step": 1730 }, { "epoch": 0.49358426005132594, "grad_norm": 1.066749627601742, "learning_rate": 5.348424016041188e-06, "loss": 0.02, "step": 1731 }, { "epoch": 0.4938694040490448, "grad_norm": 2.035347543339062, "learning_rate": 5.343816464888136e-06, "loss": 0.0404, "step": 1732 }, { "epoch": 0.4941545480467636, "grad_norm": 3.3356286353565414, "learning_rate": 5.339208620366827e-06, "loss": 0.0574, "step": 1733 }, { "epoch": 0.49443969204448246, "grad_norm": 1.5158864916198709, "learning_rate": 5.334600486408995e-06, "loss": 0.0407, "step": 1734 }, { "epoch": 0.4947248360422013, "grad_norm": 1.0504079623189557, "learning_rate": 5.3299920669466224e-06, "loss": 0.0199, "step": 1735 }, { "epoch": 0.49500998003992014, "grad_norm": 1.2772606321263538, "learning_rate": 5.325383365911937e-06, "loss": 0.0203, "step": 1736 }, { "epoch": 0.49529512403763903, "grad_norm": 0.7819050740242356, "learning_rate": 5.320774387237403e-06, "loss": 0.0141, "step": 1737 }, { "epoch": 0.4955802680353579, "grad_norm": 1.0278612287001418, "learning_rate": 5.316165134855724e-06, "loss": 0.0157, "step": 1738 }, { "epoch": 0.4958654120330767, "grad_norm": 1.4731255811320176, "learning_rate": 5.311555612699837e-06, "loss": 0.0289, "step": 1739 }, { "epoch": 0.49615055603079555, "grad_norm": 0.7990479306692042, "learning_rate": 5.306945824702908e-06, "loss": 0.0091, "step": 1740 }, { "epoch": 0.4964357000285144, "grad_norm": 1.3645150027377915, "learning_rate": 5.302335774798331e-06, "loss": 0.0417, "step": 1741 }, { "epoch": 0.49672084402623323, "grad_norm": 0.5097086067391874, "learning_rate": 5.297725466919722e-06, "loss": 0.0067, "step": 1742 }, { "epoch": 0.49700598802395207, "grad_norm": 1.220765771700666, "learning_rate": 5.29311490500092e-06, "loss": 0.0422, "step": 1743 }, { "epoch": 0.49729113202167097, "grad_norm": 0.6971180492227302, "learning_rate": 5.288504092975975e-06, "loss": 0.0206, "step": 1744 }, { "epoch": 0.4975762760193898, "grad_norm": 1.0855891533617872, "learning_rate": 5.2838930347791605e-06, "loss": 0.0377, "step": 1745 }, { "epoch": 0.49786142001710865, "grad_norm": 0.5153649468444501, "learning_rate": 5.279281734344951e-06, "loss": 0.0115, "step": 1746 }, { "epoch": 0.4981465640148275, "grad_norm": 0.8506337273843959, "learning_rate": 5.274670195608029e-06, "loss": 0.0127, "step": 1747 }, { "epoch": 0.4984317080125463, "grad_norm": 1.1073160831408677, "learning_rate": 5.270058422503284e-06, "loss": 0.0191, "step": 1748 }, { "epoch": 0.49871685201026517, "grad_norm": 1.7959372472589628, "learning_rate": 5.265446418965801e-06, "loss": 0.0349, "step": 1749 }, { "epoch": 0.499001996007984, "grad_norm": 1.2999018982531467, "learning_rate": 5.260834188930867e-06, "loss": 0.0144, "step": 1750 }, { "epoch": 0.4992871400057029, "grad_norm": 1.1152004670396407, "learning_rate": 5.256221736333959e-06, "loss": 0.0389, "step": 1751 }, { "epoch": 0.49957228400342174, "grad_norm": 1.194164124119914, "learning_rate": 5.25160906511074e-06, "loss": 0.0264, "step": 1752 }, { "epoch": 0.4998574280011406, "grad_norm": 1.2213677453155716, "learning_rate": 5.246996179197068e-06, "loss": 0.0245, "step": 1753 }, { "epoch": 0.5001425719988595, "grad_norm": 0.5547879059243418, "learning_rate": 5.24238308252898e-06, "loss": 0.017, "step": 1754 }, { "epoch": 0.5004277159965783, "grad_norm": 0.7060694139102462, "learning_rate": 5.23776977904269e-06, "loss": 0.0196, "step": 1755 }, { "epoch": 0.5007128599942972, "grad_norm": 0.8256575425352183, "learning_rate": 5.233156272674595e-06, "loss": 0.0109, "step": 1756 }, { "epoch": 0.500998003992016, "grad_norm": 1.2395878805797904, "learning_rate": 5.228542567361258e-06, "loss": 0.0264, "step": 1757 }, { "epoch": 0.5012831479897348, "grad_norm": 0.5321859041552769, "learning_rate": 5.223928667039414e-06, "loss": 0.0097, "step": 1758 }, { "epoch": 0.5015682919874537, "grad_norm": 0.6929216929421017, "learning_rate": 5.219314575645972e-06, "loss": 0.03, "step": 1759 }, { "epoch": 0.5018534359851725, "grad_norm": 1.3885854817205727, "learning_rate": 5.214700297117989e-06, "loss": 0.0214, "step": 1760 }, { "epoch": 0.5021385799828914, "grad_norm": 1.0482514572374837, "learning_rate": 5.210085835392696e-06, "loss": 0.0229, "step": 1761 }, { "epoch": 0.5024237239806102, "grad_norm": 1.180111096741413, "learning_rate": 5.205471194407474e-06, "loss": 0.0476, "step": 1762 }, { "epoch": 0.502708867978329, "grad_norm": 1.8163067516658773, "learning_rate": 5.2008563780998554e-06, "loss": 0.0289, "step": 1763 }, { "epoch": 0.5029940119760479, "grad_norm": 0.7027677059471549, "learning_rate": 5.196241390407526e-06, "loss": 0.023, "step": 1764 }, { "epoch": 0.5032791559737667, "grad_norm": 1.8922631201234337, "learning_rate": 5.1916262352683154e-06, "loss": 0.0283, "step": 1765 }, { "epoch": 0.5035642999714856, "grad_norm": 1.5571212020941443, "learning_rate": 5.187010916620196e-06, "loss": 0.0361, "step": 1766 }, { "epoch": 0.5038494439692045, "grad_norm": 0.8174284451105421, "learning_rate": 5.182395438401282e-06, "loss": 0.0224, "step": 1767 }, { "epoch": 0.5041345879669233, "grad_norm": 1.3080120864075888, "learning_rate": 5.177779804549821e-06, "loss": 0.0142, "step": 1768 }, { "epoch": 0.5044197319646422, "grad_norm": 1.8386928220755963, "learning_rate": 5.173164019004195e-06, "loss": 0.0305, "step": 1769 }, { "epoch": 0.504704875962361, "grad_norm": 1.1936198989153632, "learning_rate": 5.168548085702916e-06, "loss": 0.0387, "step": 1770 }, { "epoch": 0.5049900199600799, "grad_norm": 1.6312386336058349, "learning_rate": 5.16393200858462e-06, "loss": 0.0306, "step": 1771 }, { "epoch": 0.5052751639577987, "grad_norm": 0.9145988600574612, "learning_rate": 5.159315791588068e-06, "loss": 0.011, "step": 1772 }, { "epoch": 0.5055603079555175, "grad_norm": 0.902930547135958, "learning_rate": 5.154699438652139e-06, "loss": 0.0148, "step": 1773 }, { "epoch": 0.5058454519532364, "grad_norm": 0.8416064963586531, "learning_rate": 5.150082953715826e-06, "loss": 0.0142, "step": 1774 }, { "epoch": 0.5061305959509552, "grad_norm": 0.8900714074477286, "learning_rate": 5.145466340718241e-06, "loss": 0.0219, "step": 1775 }, { "epoch": 0.5064157399486741, "grad_norm": 1.66826695692096, "learning_rate": 5.140849603598598e-06, "loss": 0.042, "step": 1776 }, { "epoch": 0.5067008839463929, "grad_norm": 2.656687757346725, "learning_rate": 5.13623274629622e-06, "loss": 0.0651, "step": 1777 }, { "epoch": 0.5069860279441117, "grad_norm": 0.5318371444639051, "learning_rate": 5.131615772750534e-06, "loss": 0.0124, "step": 1778 }, { "epoch": 0.5072711719418306, "grad_norm": 1.870474093394001, "learning_rate": 5.126998686901066e-06, "loss": 0.0444, "step": 1779 }, { "epoch": 0.5075563159395494, "grad_norm": 0.27375968758014385, "learning_rate": 5.122381492687431e-06, "loss": 0.0082, "step": 1780 }, { "epoch": 0.5078414599372684, "grad_norm": 0.5572083483107301, "learning_rate": 5.1177641940493505e-06, "loss": 0.0198, "step": 1781 }, { "epoch": 0.5081266039349872, "grad_norm": 1.8211201480961516, "learning_rate": 5.113146794926619e-06, "loss": 0.0605, "step": 1782 }, { "epoch": 0.508411747932706, "grad_norm": 1.629287587730149, "learning_rate": 5.108529299259129e-06, "loss": 0.0485, "step": 1783 }, { "epoch": 0.5086968919304249, "grad_norm": 1.2077436875656857, "learning_rate": 5.10391171098685e-06, "loss": 0.0262, "step": 1784 }, { "epoch": 0.5089820359281437, "grad_norm": 1.016814636419437, "learning_rate": 5.099294034049829e-06, "loss": 0.0424, "step": 1785 }, { "epoch": 0.5092671799258626, "grad_norm": 0.7224054605047718, "learning_rate": 5.0946762723881935e-06, "loss": 0.0101, "step": 1786 }, { "epoch": 0.5095523239235814, "grad_norm": 1.081931745901931, "learning_rate": 5.09005842994214e-06, "loss": 0.0277, "step": 1787 }, { "epoch": 0.5098374679213002, "grad_norm": 0.5980162131093523, "learning_rate": 5.0854405106519336e-06, "loss": 0.0144, "step": 1788 }, { "epoch": 0.5101226119190191, "grad_norm": 0.32617215799420385, "learning_rate": 5.080822518457909e-06, "loss": 0.0091, "step": 1789 }, { "epoch": 0.5104077559167379, "grad_norm": 1.637195558047833, "learning_rate": 5.0762044573004555e-06, "loss": 0.0346, "step": 1790 }, { "epoch": 0.5106928999144568, "grad_norm": 1.9376813290083361, "learning_rate": 5.071586331120031e-06, "loss": 0.0268, "step": 1791 }, { "epoch": 0.5109780439121756, "grad_norm": 0.8777109804793124, "learning_rate": 5.066968143857141e-06, "loss": 0.0099, "step": 1792 }, { "epoch": 0.5112631879098944, "grad_norm": 1.1639274694760067, "learning_rate": 5.062349899452345e-06, "loss": 0.0325, "step": 1793 }, { "epoch": 0.5115483319076134, "grad_norm": 0.9469853264768066, "learning_rate": 5.057731601846253e-06, "loss": 0.0388, "step": 1794 }, { "epoch": 0.5118334759053322, "grad_norm": 1.55965099823004, "learning_rate": 5.053113254979522e-06, "loss": 0.0317, "step": 1795 }, { "epoch": 0.5121186199030511, "grad_norm": 1.0572192459413616, "learning_rate": 5.048494862792845e-06, "loss": 0.0279, "step": 1796 }, { "epoch": 0.5124037639007699, "grad_norm": 0.6540128291643972, "learning_rate": 5.043876429226962e-06, "loss": 0.0082, "step": 1797 }, { "epoch": 0.5126889078984888, "grad_norm": 1.4677331238705937, "learning_rate": 5.039257958222638e-06, "loss": 0.052, "step": 1798 }, { "epoch": 0.5129740518962076, "grad_norm": 0.6886544253965207, "learning_rate": 5.0346394537206776e-06, "loss": 0.0224, "step": 1799 }, { "epoch": 0.5132591958939264, "grad_norm": 1.674371036871377, "learning_rate": 5.030020919661913e-06, "loss": 0.0487, "step": 1800 }, { "epoch": 0.5135443398916453, "grad_norm": 1.374316064323489, "learning_rate": 5.025402359987201e-06, "loss": 0.0288, "step": 1801 }, { "epoch": 0.5138294838893641, "grad_norm": 0.8573307882459512, "learning_rate": 5.020783778637415e-06, "loss": 0.0275, "step": 1802 }, { "epoch": 0.514114627887083, "grad_norm": 1.2301193990464756, "learning_rate": 5.016165179553459e-06, "loss": 0.0263, "step": 1803 }, { "epoch": 0.5143997718848018, "grad_norm": 1.2182295130590761, "learning_rate": 5.011546566676239e-06, "loss": 0.0359, "step": 1804 }, { "epoch": 0.5146849158825206, "grad_norm": 1.133488244714276, "learning_rate": 5.006927943946681e-06, "loss": 0.0167, "step": 1805 }, { "epoch": 0.5149700598802395, "grad_norm": 0.9176237886469092, "learning_rate": 5.0023093153057174e-06, "loss": 0.02, "step": 1806 }, { "epoch": 0.5152552038779584, "grad_norm": 1.0261123312743583, "learning_rate": 4.997690684694285e-06, "loss": 0.0333, "step": 1807 }, { "epoch": 0.5155403478756773, "grad_norm": 1.5110718674130426, "learning_rate": 4.99307205605332e-06, "loss": 0.0294, "step": 1808 }, { "epoch": 0.5158254918733961, "grad_norm": 0.6864276343112098, "learning_rate": 4.988453433323763e-06, "loss": 0.0347, "step": 1809 }, { "epoch": 0.5161106358711149, "grad_norm": 1.1248317448171876, "learning_rate": 4.983834820446542e-06, "loss": 0.0347, "step": 1810 }, { "epoch": 0.5163957798688338, "grad_norm": 0.9943083426147765, "learning_rate": 4.979216221362585e-06, "loss": 0.0255, "step": 1811 }, { "epoch": 0.5166809238665526, "grad_norm": 0.9053821929583953, "learning_rate": 4.9745976400128e-06, "loss": 0.0165, "step": 1812 }, { "epoch": 0.5169660678642715, "grad_norm": 1.3710164561870049, "learning_rate": 4.96997908033809e-06, "loss": 0.0246, "step": 1813 }, { "epoch": 0.5172512118619903, "grad_norm": 0.879598522359968, "learning_rate": 4.965360546279324e-06, "loss": 0.0239, "step": 1814 }, { "epoch": 0.5175363558597091, "grad_norm": 0.6133796107685641, "learning_rate": 4.960742041777365e-06, "loss": 0.016, "step": 1815 }, { "epoch": 0.517821499857428, "grad_norm": 0.7668338592272866, "learning_rate": 4.95612357077304e-06, "loss": 0.0233, "step": 1816 }, { "epoch": 0.5181066438551468, "grad_norm": 1.8711063892336894, "learning_rate": 4.951505137207155e-06, "loss": 0.0433, "step": 1817 }, { "epoch": 0.5183917878528657, "grad_norm": 1.4693009707813043, "learning_rate": 4.94688674502048e-06, "loss": 0.0192, "step": 1818 }, { "epoch": 0.5186769318505845, "grad_norm": 1.0043129018742638, "learning_rate": 4.942268398153748e-06, "loss": 0.032, "step": 1819 }, { "epoch": 0.5189620758483033, "grad_norm": 0.9700760923720372, "learning_rate": 4.937650100547656e-06, "loss": 0.03, "step": 1820 }, { "epoch": 0.5192472198460223, "grad_norm": 1.6874603824297643, "learning_rate": 4.933031856142862e-06, "loss": 0.0434, "step": 1821 }, { "epoch": 0.5195323638437411, "grad_norm": 1.6134747702626453, "learning_rate": 4.92841366887997e-06, "loss": 0.0249, "step": 1822 }, { "epoch": 0.51981750784146, "grad_norm": 0.7329551510874709, "learning_rate": 4.9237955426995445e-06, "loss": 0.0282, "step": 1823 }, { "epoch": 0.5201026518391788, "grad_norm": 1.502060656484669, "learning_rate": 4.919177481542094e-06, "loss": 0.0248, "step": 1824 }, { "epoch": 0.5203877958368976, "grad_norm": 1.0356299700200353, "learning_rate": 4.914559489348068e-06, "loss": 0.0133, "step": 1825 }, { "epoch": 0.5206729398346165, "grad_norm": 1.3304571261943245, "learning_rate": 4.909941570057861e-06, "loss": 0.0259, "step": 1826 }, { "epoch": 0.5209580838323353, "grad_norm": 0.8800083351648127, "learning_rate": 4.905323727611807e-06, "loss": 0.0253, "step": 1827 }, { "epoch": 0.5212432278300542, "grad_norm": 0.3567933218332538, "learning_rate": 4.900705965950172e-06, "loss": 0.012, "step": 1828 }, { "epoch": 0.521528371827773, "grad_norm": 0.8401451215832865, "learning_rate": 4.896088289013153e-06, "loss": 0.0191, "step": 1829 }, { "epoch": 0.5218135158254918, "grad_norm": 1.1818507943017555, "learning_rate": 4.891470700740872e-06, "loss": 0.0253, "step": 1830 }, { "epoch": 0.5220986598232107, "grad_norm": 0.9347321602247102, "learning_rate": 4.886853205073382e-06, "loss": 0.0204, "step": 1831 }, { "epoch": 0.5223838038209295, "grad_norm": 1.2285692447409555, "learning_rate": 4.882235805950651e-06, "loss": 0.0204, "step": 1832 }, { "epoch": 0.5226689478186484, "grad_norm": 0.43955725006957547, "learning_rate": 4.877618507312568e-06, "loss": 0.0174, "step": 1833 }, { "epoch": 0.5229540918163673, "grad_norm": 0.6565153634012186, "learning_rate": 4.873001313098937e-06, "loss": 0.0135, "step": 1834 }, { "epoch": 0.5232392358140862, "grad_norm": 1.113652633722711, "learning_rate": 4.868384227249468e-06, "loss": 0.0239, "step": 1835 }, { "epoch": 0.523524379811805, "grad_norm": 1.2861091962055604, "learning_rate": 4.8637672537037815e-06, "loss": 0.0519, "step": 1836 }, { "epoch": 0.5238095238095238, "grad_norm": 0.8292555437480343, "learning_rate": 4.859150396401404e-06, "loss": 0.0145, "step": 1837 }, { "epoch": 0.5240946678072427, "grad_norm": 2.2543232111596896, "learning_rate": 4.85453365928176e-06, "loss": 0.0325, "step": 1838 }, { "epoch": 0.5243798118049615, "grad_norm": 0.7480667232985087, "learning_rate": 4.849917046284175e-06, "loss": 0.0139, "step": 1839 }, { "epoch": 0.5246649558026804, "grad_norm": 0.4608883072551659, "learning_rate": 4.8453005613478635e-06, "loss": 0.0082, "step": 1840 }, { "epoch": 0.5249500998003992, "grad_norm": 0.9653852346298967, "learning_rate": 4.8406842084119345e-06, "loss": 0.0378, "step": 1841 }, { "epoch": 0.525235243798118, "grad_norm": 0.6538953169532171, "learning_rate": 4.8360679914153805e-06, "loss": 0.0236, "step": 1842 }, { "epoch": 0.5255203877958369, "grad_norm": 2.7130229909581063, "learning_rate": 4.831451914297086e-06, "loss": 0.0351, "step": 1843 }, { "epoch": 0.5258055317935557, "grad_norm": 0.9988544260022847, "learning_rate": 4.826835980995806e-06, "loss": 0.0313, "step": 1844 }, { "epoch": 0.5260906757912746, "grad_norm": 0.4820387580248128, "learning_rate": 4.82222019545018e-06, "loss": 0.0094, "step": 1845 }, { "epoch": 0.5263758197889934, "grad_norm": 0.6020909214467295, "learning_rate": 4.81760456159872e-06, "loss": 0.0207, "step": 1846 }, { "epoch": 0.5266609637867123, "grad_norm": 0.34297314496197695, "learning_rate": 4.812989083379806e-06, "loss": 0.0091, "step": 1847 }, { "epoch": 0.5269461077844312, "grad_norm": 0.9821597455371975, "learning_rate": 4.808373764731686e-06, "loss": 0.026, "step": 1848 }, { "epoch": 0.52723125178215, "grad_norm": 1.427547834808575, "learning_rate": 4.8037586095924756e-06, "loss": 0.04, "step": 1849 }, { "epoch": 0.5275163957798689, "grad_norm": 0.9056240346340622, "learning_rate": 4.799143621900146e-06, "loss": 0.0355, "step": 1850 }, { "epoch": 0.5278015397775877, "grad_norm": 1.603722643513683, "learning_rate": 4.794528805592529e-06, "loss": 0.0342, "step": 1851 }, { "epoch": 0.5280866837753065, "grad_norm": 0.5955654585757876, "learning_rate": 4.789914164607305e-06, "loss": 0.0158, "step": 1852 }, { "epoch": 0.5283718277730254, "grad_norm": 1.1955850884312524, "learning_rate": 4.785299702882012e-06, "loss": 0.0153, "step": 1853 }, { "epoch": 0.5286569717707442, "grad_norm": 0.8638149177647332, "learning_rate": 4.7806854243540304e-06, "loss": 0.0283, "step": 1854 }, { "epoch": 0.5289421157684631, "grad_norm": 0.9683679448303014, "learning_rate": 4.776071332960586e-06, "loss": 0.0224, "step": 1855 }, { "epoch": 0.5292272597661819, "grad_norm": 1.0544906834955496, "learning_rate": 4.771457432638744e-06, "loss": 0.018, "step": 1856 }, { "epoch": 0.5295124037639007, "grad_norm": 0.5133813970021956, "learning_rate": 4.766843727325408e-06, "loss": 0.0095, "step": 1857 }, { "epoch": 0.5297975477616196, "grad_norm": 1.0106462976613688, "learning_rate": 4.7622302209573105e-06, "loss": 0.0304, "step": 1858 }, { "epoch": 0.5300826917593384, "grad_norm": 1.4067320553021943, "learning_rate": 4.757616917471021e-06, "loss": 0.0326, "step": 1859 }, { "epoch": 0.5303678357570573, "grad_norm": 0.9000475309193106, "learning_rate": 4.753003820802933e-06, "loss": 0.0154, "step": 1860 }, { "epoch": 0.5306529797547762, "grad_norm": 0.7559939464533812, "learning_rate": 4.748390934889261e-06, "loss": 0.0252, "step": 1861 }, { "epoch": 0.530938123752495, "grad_norm": 0.7567111911478112, "learning_rate": 4.743778263666045e-06, "loss": 0.0098, "step": 1862 }, { "epoch": 0.5312232677502139, "grad_norm": 2.6084997004648582, "learning_rate": 4.739165811069135e-06, "loss": 0.0385, "step": 1863 }, { "epoch": 0.5315084117479327, "grad_norm": 0.7239400307328033, "learning_rate": 4.734553581034199e-06, "loss": 0.0275, "step": 1864 }, { "epoch": 0.5317935557456516, "grad_norm": 0.7103481052891866, "learning_rate": 4.729941577496718e-06, "loss": 0.0091, "step": 1865 }, { "epoch": 0.5320786997433704, "grad_norm": 1.4097950329346804, "learning_rate": 4.7253298043919725e-06, "loss": 0.0234, "step": 1866 }, { "epoch": 0.5323638437410892, "grad_norm": 1.7077434751311544, "learning_rate": 4.72071826565505e-06, "loss": 0.0292, "step": 1867 }, { "epoch": 0.5326489877388081, "grad_norm": 0.5457004148411975, "learning_rate": 4.71610696522084e-06, "loss": 0.009, "step": 1868 }, { "epoch": 0.5329341317365269, "grad_norm": 0.5406920476646435, "learning_rate": 4.7114959070240254e-06, "loss": 0.0079, "step": 1869 }, { "epoch": 0.5332192757342458, "grad_norm": 1.8334680965862749, "learning_rate": 4.706885094999082e-06, "loss": 0.0344, "step": 1870 }, { "epoch": 0.5335044197319646, "grad_norm": 1.011692063206279, "learning_rate": 4.70227453308028e-06, "loss": 0.0244, "step": 1871 }, { "epoch": 0.5337895637296834, "grad_norm": 0.8936841628052329, "learning_rate": 4.697664225201671e-06, "loss": 0.0127, "step": 1872 }, { "epoch": 0.5340747077274023, "grad_norm": 0.8113649868154692, "learning_rate": 4.693054175297095e-06, "loss": 0.0192, "step": 1873 }, { "epoch": 0.5343598517251212, "grad_norm": 1.1137611632560478, "learning_rate": 4.688444387300165e-06, "loss": 0.0284, "step": 1874 }, { "epoch": 0.5346449957228401, "grad_norm": 0.9679934453395643, "learning_rate": 4.683834865144277e-06, "loss": 0.0222, "step": 1875 }, { "epoch": 0.5349301397205589, "grad_norm": 2.406256084642815, "learning_rate": 4.679225612762598e-06, "loss": 0.0469, "step": 1876 }, { "epoch": 0.5352152837182778, "grad_norm": 1.9285676628551522, "learning_rate": 4.6746166340880636e-06, "loss": 0.0238, "step": 1877 }, { "epoch": 0.5355004277159966, "grad_norm": 1.5021513223725813, "learning_rate": 4.6700079330533775e-06, "loss": 0.0436, "step": 1878 }, { "epoch": 0.5357855717137154, "grad_norm": 1.0941741453384195, "learning_rate": 4.665399513591008e-06, "loss": 0.0329, "step": 1879 }, { "epoch": 0.5360707157114343, "grad_norm": 0.39730754365810067, "learning_rate": 4.660791379633175e-06, "loss": 0.01, "step": 1880 }, { "epoch": 0.5363558597091531, "grad_norm": 1.7227973412048632, "learning_rate": 4.656183535111865e-06, "loss": 0.0294, "step": 1881 }, { "epoch": 0.536641003706872, "grad_norm": 0.4622647741414944, "learning_rate": 4.651575983958813e-06, "loss": 0.0105, "step": 1882 }, { "epoch": 0.5369261477045908, "grad_norm": 0.4489009206942475, "learning_rate": 4.646968730105503e-06, "loss": 0.0092, "step": 1883 }, { "epoch": 0.5372112917023096, "grad_norm": 0.6693708589940758, "learning_rate": 4.6423617774831675e-06, "loss": 0.0131, "step": 1884 }, { "epoch": 0.5374964357000285, "grad_norm": 0.8512303652055226, "learning_rate": 4.63775513002278e-06, "loss": 0.0085, "step": 1885 }, { "epoch": 0.5377815796977473, "grad_norm": 1.6322548013574654, "learning_rate": 4.633148791655053e-06, "loss": 0.0318, "step": 1886 }, { "epoch": 0.5380667236954663, "grad_norm": 1.2405260613430487, "learning_rate": 4.628542766310438e-06, "loss": 0.0147, "step": 1887 }, { "epoch": 0.5383518676931851, "grad_norm": 0.4807074519508227, "learning_rate": 4.623937057919118e-06, "loss": 0.0111, "step": 1888 }, { "epoch": 0.5386370116909039, "grad_norm": 1.3954446874800244, "learning_rate": 4.619331670411005e-06, "loss": 0.0262, "step": 1889 }, { "epoch": 0.5389221556886228, "grad_norm": 1.116115723360862, "learning_rate": 4.61472660771574e-06, "loss": 0.0225, "step": 1890 }, { "epoch": 0.5392072996863416, "grad_norm": 0.5958119317926713, "learning_rate": 4.610121873762681e-06, "loss": 0.0177, "step": 1891 }, { "epoch": 0.5394924436840605, "grad_norm": 1.299507428802239, "learning_rate": 4.605517472480912e-06, "loss": 0.0259, "step": 1892 }, { "epoch": 0.5397775876817793, "grad_norm": 0.8638980441903176, "learning_rate": 4.6009134077992276e-06, "loss": 0.0165, "step": 1893 }, { "epoch": 0.5400627316794981, "grad_norm": 1.549781320683181, "learning_rate": 4.5963096836461405e-06, "loss": 0.0241, "step": 1894 }, { "epoch": 0.540347875677217, "grad_norm": 0.8766341228007742, "learning_rate": 4.5917063039498705e-06, "loss": 0.0163, "step": 1895 }, { "epoch": 0.5406330196749358, "grad_norm": 0.7287071167783781, "learning_rate": 4.587103272638339e-06, "loss": 0.0062, "step": 1896 }, { "epoch": 0.5409181636726547, "grad_norm": 0.8942627426261667, "learning_rate": 4.582500593639178e-06, "loss": 0.0124, "step": 1897 }, { "epoch": 0.5412033076703735, "grad_norm": 0.9701042144339818, "learning_rate": 4.5778982708797146e-06, "loss": 0.0094, "step": 1898 }, { "epoch": 0.5414884516680923, "grad_norm": 0.755399450943285, "learning_rate": 4.573296308286971e-06, "loss": 0.0141, "step": 1899 }, { "epoch": 0.5417735956658112, "grad_norm": 0.7506799238469901, "learning_rate": 4.568694709787665e-06, "loss": 0.0257, "step": 1900 }, { "epoch": 0.5420587396635301, "grad_norm": 1.2250538112210514, "learning_rate": 4.564093479308205e-06, "loss": 0.0179, "step": 1901 }, { "epoch": 0.542343883661249, "grad_norm": 1.6755192524244813, "learning_rate": 4.559492620774676e-06, "loss": 0.0249, "step": 1902 }, { "epoch": 0.5426290276589678, "grad_norm": 0.8966395601407366, "learning_rate": 4.554892138112856e-06, "loss": 0.0127, "step": 1903 }, { "epoch": 0.5429141716566867, "grad_norm": 1.5035988745836084, "learning_rate": 4.5502920352482004e-06, "loss": 0.0439, "step": 1904 }, { "epoch": 0.5431993156544055, "grad_norm": 1.7840819508041883, "learning_rate": 4.545692316105835e-06, "loss": 0.0239, "step": 1905 }, { "epoch": 0.5434844596521243, "grad_norm": 0.31678794815398437, "learning_rate": 4.541092984610566e-06, "loss": 0.0033, "step": 1906 }, { "epoch": 0.5437696036498432, "grad_norm": 1.0022488627050703, "learning_rate": 4.536494044686863e-06, "loss": 0.0187, "step": 1907 }, { "epoch": 0.544054747647562, "grad_norm": 2.297509688971944, "learning_rate": 4.531895500258861e-06, "loss": 0.0299, "step": 1908 }, { "epoch": 0.5443398916452808, "grad_norm": 1.5802445456531595, "learning_rate": 4.5272973552503614e-06, "loss": 0.0303, "step": 1909 }, { "epoch": 0.5446250356429997, "grad_norm": 0.7996338326856399, "learning_rate": 4.522699613584825e-06, "loss": 0.0123, "step": 1910 }, { "epoch": 0.5449101796407185, "grad_norm": 1.5678553036220793, "learning_rate": 4.5181022791853625e-06, "loss": 0.0278, "step": 1911 }, { "epoch": 0.5451953236384374, "grad_norm": 1.2576223644296152, "learning_rate": 4.513505355974747e-06, "loss": 0.0215, "step": 1912 }, { "epoch": 0.5454804676361562, "grad_norm": 2.7271023649179162, "learning_rate": 4.50890884787539e-06, "loss": 0.0357, "step": 1913 }, { "epoch": 0.5457656116338752, "grad_norm": 0.5599793478699585, "learning_rate": 4.5043127588093545e-06, "loss": 0.0136, "step": 1914 }, { "epoch": 0.546050755631594, "grad_norm": 1.134277118458878, "learning_rate": 4.4997170926983465e-06, "loss": 0.0126, "step": 1915 }, { "epoch": 0.5463358996293128, "grad_norm": 1.7917881712046206, "learning_rate": 4.495121853463709e-06, "loss": 0.0273, "step": 1916 }, { "epoch": 0.5466210436270317, "grad_norm": 2.0607418747445347, "learning_rate": 4.490527045026423e-06, "loss": 0.0351, "step": 1917 }, { "epoch": 0.5469061876247505, "grad_norm": 0.4922356489606496, "learning_rate": 4.4859326713070996e-06, "loss": 0.008, "step": 1918 }, { "epoch": 0.5471913316224694, "grad_norm": 0.9315432349848304, "learning_rate": 4.481338736225978e-06, "loss": 0.0252, "step": 1919 }, { "epoch": 0.5474764756201882, "grad_norm": 1.2682141717774775, "learning_rate": 4.476745243702927e-06, "loss": 0.0212, "step": 1920 }, { "epoch": 0.547761619617907, "grad_norm": 0.850819395547047, "learning_rate": 4.472152197657434e-06, "loss": 0.0206, "step": 1921 }, { "epoch": 0.5480467636156259, "grad_norm": 0.8083997376498508, "learning_rate": 4.46755960200861e-06, "loss": 0.0197, "step": 1922 }, { "epoch": 0.5483319076133447, "grad_norm": 1.4339268897454358, "learning_rate": 4.462967460675178e-06, "loss": 0.0295, "step": 1923 }, { "epoch": 0.5486170516110636, "grad_norm": 2.1314141503209276, "learning_rate": 4.458375777575473e-06, "loss": 0.056, "step": 1924 }, { "epoch": 0.5489021956087824, "grad_norm": 1.7031785767526983, "learning_rate": 4.453784556627439e-06, "loss": 0.0476, "step": 1925 }, { "epoch": 0.5491873396065012, "grad_norm": 1.2263205466527023, "learning_rate": 4.449193801748629e-06, "loss": 0.016, "step": 1926 }, { "epoch": 0.5494724836042202, "grad_norm": 0.7947498299549632, "learning_rate": 4.4446035168561955e-06, "loss": 0.0155, "step": 1927 }, { "epoch": 0.549757627601939, "grad_norm": 0.4966588434774226, "learning_rate": 4.440013705866895e-06, "loss": 0.0085, "step": 1928 }, { "epoch": 0.5500427715996579, "grad_norm": 1.6419794017402862, "learning_rate": 4.435424372697066e-06, "loss": 0.0344, "step": 1929 }, { "epoch": 0.5503279155973767, "grad_norm": 0.8164655165099096, "learning_rate": 4.430835521262656e-06, "loss": 0.0304, "step": 1930 }, { "epoch": 0.5506130595950955, "grad_norm": 1.6944102182051353, "learning_rate": 4.4262471554791895e-06, "loss": 0.0395, "step": 1931 }, { "epoch": 0.5508982035928144, "grad_norm": 1.7309887958840326, "learning_rate": 4.421659279261785e-06, "loss": 0.0425, "step": 1932 }, { "epoch": 0.5511833475905332, "grad_norm": 2.5273881172214496, "learning_rate": 4.417071896525138e-06, "loss": 0.0521, "step": 1933 }, { "epoch": 0.5514684915882521, "grad_norm": 3.4173362033274346, "learning_rate": 4.412485011183525e-06, "loss": 0.0638, "step": 1934 }, { "epoch": 0.5517536355859709, "grad_norm": 0.9743219257548228, "learning_rate": 4.407898627150795e-06, "loss": 0.0186, "step": 1935 }, { "epoch": 0.5520387795836897, "grad_norm": 2.349422676591217, "learning_rate": 4.403312748340373e-06, "loss": 0.0569, "step": 1936 }, { "epoch": 0.5523239235814086, "grad_norm": 0.53449757299917, "learning_rate": 4.398727378665252e-06, "loss": 0.0097, "step": 1937 }, { "epoch": 0.5526090675791274, "grad_norm": 1.0831620617986117, "learning_rate": 4.394142522037991e-06, "loss": 0.0263, "step": 1938 }, { "epoch": 0.5528942115768463, "grad_norm": 1.383221056170318, "learning_rate": 4.38955818237071e-06, "loss": 0.0337, "step": 1939 }, { "epoch": 0.5531793555745651, "grad_norm": 1.6813526071732885, "learning_rate": 4.384974363575086e-06, "loss": 0.0291, "step": 1940 }, { "epoch": 0.553464499572284, "grad_norm": 0.41544282220199685, "learning_rate": 4.3803910695623555e-06, "loss": 0.017, "step": 1941 }, { "epoch": 0.5537496435700029, "grad_norm": 2.098566976792298, "learning_rate": 4.375808304243305e-06, "loss": 0.0207, "step": 1942 }, { "epoch": 0.5540347875677217, "grad_norm": 0.5784708780835224, "learning_rate": 4.3712260715282716e-06, "loss": 0.007, "step": 1943 }, { "epoch": 0.5543199315654406, "grad_norm": 1.2225541332263947, "learning_rate": 4.3666443753271355e-06, "loss": 0.0319, "step": 1944 }, { "epoch": 0.5546050755631594, "grad_norm": 1.08491250220113, "learning_rate": 4.362063219549323e-06, "loss": 0.0229, "step": 1945 }, { "epoch": 0.5548902195608783, "grad_norm": 0.6817244520114021, "learning_rate": 4.3574826081037935e-06, "loss": 0.0214, "step": 1946 }, { "epoch": 0.5551753635585971, "grad_norm": 1.1664569501875723, "learning_rate": 4.352902544899044e-06, "loss": 0.0229, "step": 1947 }, { "epoch": 0.5554605075563159, "grad_norm": 1.293836452879822, "learning_rate": 4.34832303384311e-06, "loss": 0.0156, "step": 1948 }, { "epoch": 0.5557456515540348, "grad_norm": 0.8022586547523411, "learning_rate": 4.343744078843545e-06, "loss": 0.0338, "step": 1949 }, { "epoch": 0.5560307955517536, "grad_norm": 0.6083879507772137, "learning_rate": 4.33916568380744e-06, "loss": 0.0184, "step": 1950 }, { "epoch": 0.5563159395494724, "grad_norm": 0.7357572185407785, "learning_rate": 4.334587852641396e-06, "loss": 0.0127, "step": 1951 }, { "epoch": 0.5566010835471913, "grad_norm": 0.7103951655209286, "learning_rate": 4.330010589251541e-06, "loss": 0.0137, "step": 1952 }, { "epoch": 0.5568862275449101, "grad_norm": 2.0436097540893847, "learning_rate": 4.325433897543516e-06, "loss": 0.0439, "step": 1953 }, { "epoch": 0.5571713715426291, "grad_norm": 1.0347939995255582, "learning_rate": 4.320857781422476e-06, "loss": 0.0144, "step": 1954 }, { "epoch": 0.5574565155403479, "grad_norm": 1.2205245382671843, "learning_rate": 4.31628224479308e-06, "loss": 0.0343, "step": 1955 }, { "epoch": 0.5577416595380668, "grad_norm": 0.48096511602458747, "learning_rate": 4.311707291559502e-06, "loss": 0.0151, "step": 1956 }, { "epoch": 0.5580268035357856, "grad_norm": 1.858218874925223, "learning_rate": 4.307132925625405e-06, "loss": 0.024, "step": 1957 }, { "epoch": 0.5583119475335044, "grad_norm": 0.41558860469055026, "learning_rate": 4.302559150893962e-06, "loss": 0.0081, "step": 1958 }, { "epoch": 0.5585970915312233, "grad_norm": 0.6185726461872646, "learning_rate": 4.297985971267836e-06, "loss": 0.0148, "step": 1959 }, { "epoch": 0.5588822355289421, "grad_norm": 1.629041987647595, "learning_rate": 4.293413390649186e-06, "loss": 0.0382, "step": 1960 }, { "epoch": 0.559167379526661, "grad_norm": 1.1013738682804133, "learning_rate": 4.288841412939656e-06, "loss": 0.0123, "step": 1961 }, { "epoch": 0.5594525235243798, "grad_norm": 0.7207246603680382, "learning_rate": 4.284270042040377e-06, "loss": 0.0092, "step": 1962 }, { "epoch": 0.5597376675220986, "grad_norm": 1.3146322345300236, "learning_rate": 4.279699281851962e-06, "loss": 0.0497, "step": 1963 }, { "epoch": 0.5600228115198175, "grad_norm": 1.3720103682911193, "learning_rate": 4.275129136274505e-06, "loss": 0.02, "step": 1964 }, { "epoch": 0.5603079555175363, "grad_norm": 0.4413728823645133, "learning_rate": 4.270559609207572e-06, "loss": 0.0068, "step": 1965 }, { "epoch": 0.5605930995152552, "grad_norm": 1.042243710858581, "learning_rate": 4.265990704550204e-06, "loss": 0.0202, "step": 1966 }, { "epoch": 0.5608782435129741, "grad_norm": 1.6738885287470393, "learning_rate": 4.261422426200912e-06, "loss": 0.05, "step": 1967 }, { "epoch": 0.561163387510693, "grad_norm": 0.6925766021391758, "learning_rate": 4.256854778057668e-06, "loss": 0.0084, "step": 1968 }, { "epoch": 0.5614485315084118, "grad_norm": 1.3643708719882088, "learning_rate": 4.252287764017908e-06, "loss": 0.0386, "step": 1969 }, { "epoch": 0.5617336755061306, "grad_norm": 0.5506155217176552, "learning_rate": 4.24772138797853e-06, "loss": 0.0084, "step": 1970 }, { "epoch": 0.5620188195038495, "grad_norm": 0.555170139364492, "learning_rate": 4.243155653835886e-06, "loss": 0.0118, "step": 1971 }, { "epoch": 0.5623039635015683, "grad_norm": 0.9763249846495575, "learning_rate": 4.238590565485779e-06, "loss": 0.0139, "step": 1972 }, { "epoch": 0.5625891074992871, "grad_norm": 0.7772741042704194, "learning_rate": 4.23402612682346e-06, "loss": 0.0125, "step": 1973 }, { "epoch": 0.562874251497006, "grad_norm": 0.40993352316801507, "learning_rate": 4.2294623417436284e-06, "loss": 0.0057, "step": 1974 }, { "epoch": 0.5631593954947248, "grad_norm": 2.595191318759739, "learning_rate": 4.224899214140425e-06, "loss": 0.0483, "step": 1975 }, { "epoch": 0.5634445394924437, "grad_norm": 1.0047435454745783, "learning_rate": 4.220336747907431e-06, "loss": 0.0387, "step": 1976 }, { "epoch": 0.5637296834901625, "grad_norm": 0.8252660492184049, "learning_rate": 4.2157749469376594e-06, "loss": 0.0179, "step": 1977 }, { "epoch": 0.5640148274878813, "grad_norm": 1.7631265123768307, "learning_rate": 4.21121381512356e-06, "loss": 0.0259, "step": 1978 }, { "epoch": 0.5642999714856002, "grad_norm": 1.1505145111500783, "learning_rate": 4.206653356357007e-06, "loss": 0.0294, "step": 1979 }, { "epoch": 0.564585115483319, "grad_norm": 1.4455244317990026, "learning_rate": 4.202093574529305e-06, "loss": 0.0312, "step": 1980 }, { "epoch": 0.564870259481038, "grad_norm": 0.8626915090427404, "learning_rate": 4.197534473531177e-06, "loss": 0.0276, "step": 1981 }, { "epoch": 0.5651554034787568, "grad_norm": 1.5858817877250049, "learning_rate": 4.192976057252768e-06, "loss": 0.0192, "step": 1982 }, { "epoch": 0.5654405474764757, "grad_norm": 1.410131591822338, "learning_rate": 4.18841832958364e-06, "loss": 0.0247, "step": 1983 }, { "epoch": 0.5657256914741945, "grad_norm": 0.7445170622774078, "learning_rate": 4.18386129441276e-06, "loss": 0.0121, "step": 1984 }, { "epoch": 0.5660108354719133, "grad_norm": 0.8809924184189645, "learning_rate": 4.179304955628511e-06, "loss": 0.019, "step": 1985 }, { "epoch": 0.5662959794696322, "grad_norm": 1.5083664563446835, "learning_rate": 4.174749317118683e-06, "loss": 0.0261, "step": 1986 }, { "epoch": 0.566581123467351, "grad_norm": 2.2901675873569656, "learning_rate": 4.170194382770462e-06, "loss": 0.0279, "step": 1987 }, { "epoch": 0.5668662674650699, "grad_norm": 1.5826055506344388, "learning_rate": 4.165640156470436e-06, "loss": 0.0337, "step": 1988 }, { "epoch": 0.5671514114627887, "grad_norm": 1.486333132761605, "learning_rate": 4.161086642104594e-06, "loss": 0.0147, "step": 1989 }, { "epoch": 0.5674365554605075, "grad_norm": 1.2557151112171123, "learning_rate": 4.156533843558309e-06, "loss": 0.0224, "step": 1990 }, { "epoch": 0.5677216994582264, "grad_norm": 0.682005313725667, "learning_rate": 4.151981764716347e-06, "loss": 0.0079, "step": 1991 }, { "epoch": 0.5680068434559452, "grad_norm": 2.0989444982377665, "learning_rate": 4.147430409462863e-06, "loss": 0.0654, "step": 1992 }, { "epoch": 0.568291987453664, "grad_norm": 1.3172125697333474, "learning_rate": 4.142879781681389e-06, "loss": 0.0382, "step": 1993 }, { "epoch": 0.568577131451383, "grad_norm": 1.0200075411284906, "learning_rate": 4.138329885254842e-06, "loss": 0.0155, "step": 1994 }, { "epoch": 0.5688622754491018, "grad_norm": 1.6810817501742972, "learning_rate": 4.133780724065508e-06, "loss": 0.0284, "step": 1995 }, { "epoch": 0.5691474194468207, "grad_norm": 0.3058438071030809, "learning_rate": 4.1292323019950515e-06, "loss": 0.0057, "step": 1996 }, { "epoch": 0.5694325634445395, "grad_norm": 3.2345016887299174, "learning_rate": 4.124684622924503e-06, "loss": 0.0603, "step": 1997 }, { "epoch": 0.5697177074422584, "grad_norm": 0.25562993510263476, "learning_rate": 4.120137690734264e-06, "loss": 0.0058, "step": 1998 }, { "epoch": 0.5700028514399772, "grad_norm": 0.9607045079360945, "learning_rate": 4.115591509304092e-06, "loss": 0.019, "step": 1999 }, { "epoch": 0.570287995437696, "grad_norm": 0.6007173797861988, "learning_rate": 4.111046082513109e-06, "loss": 0.0082, "step": 2000 }, { "epoch": 0.5705731394354149, "grad_norm": 0.8863429154698693, "learning_rate": 4.106501414239787e-06, "loss": 0.0118, "step": 2001 }, { "epoch": 0.5708582834331337, "grad_norm": 1.5083003179196242, "learning_rate": 4.1019575083619595e-06, "loss": 0.0158, "step": 2002 }, { "epoch": 0.5711434274308526, "grad_norm": 0.7391504005003044, "learning_rate": 4.097414368756803e-06, "loss": 0.0088, "step": 2003 }, { "epoch": 0.5714285714285714, "grad_norm": 0.6832287705746387, "learning_rate": 4.092871999300841e-06, "loss": 0.0094, "step": 2004 }, { "epoch": 0.5717137154262902, "grad_norm": 1.6772859676317076, "learning_rate": 4.088330403869943e-06, "loss": 0.0478, "step": 2005 }, { "epoch": 0.5719988594240091, "grad_norm": 1.0306291075921608, "learning_rate": 4.083789586339315e-06, "loss": 0.0237, "step": 2006 }, { "epoch": 0.572284003421728, "grad_norm": 1.184619272802862, "learning_rate": 4.079249550583498e-06, "loss": 0.0283, "step": 2007 }, { "epoch": 0.5725691474194469, "grad_norm": 1.4096930747734462, "learning_rate": 4.074710300476372e-06, "loss": 0.014, "step": 2008 }, { "epoch": 0.5728542914171657, "grad_norm": 0.783881045789122, "learning_rate": 4.07017183989114e-06, "loss": 0.0115, "step": 2009 }, { "epoch": 0.5731394354148845, "grad_norm": 0.30172775611238883, "learning_rate": 4.065634172700335e-06, "loss": 0.0045, "step": 2010 }, { "epoch": 0.5734245794126034, "grad_norm": 0.6976220498401472, "learning_rate": 4.0610973027758124e-06, "loss": 0.0272, "step": 2011 }, { "epoch": 0.5737097234103222, "grad_norm": 1.1052303028320305, "learning_rate": 4.0565612339887455e-06, "loss": 0.017, "step": 2012 }, { "epoch": 0.5739948674080411, "grad_norm": 1.7404973231076455, "learning_rate": 4.052025970209626e-06, "loss": 0.0392, "step": 2013 }, { "epoch": 0.5742800114057599, "grad_norm": 0.674052952485932, "learning_rate": 4.047491515308259e-06, "loss": 0.0082, "step": 2014 }, { "epoch": 0.5745651554034787, "grad_norm": 0.7117202921993683, "learning_rate": 4.042957873153758e-06, "loss": 0.0178, "step": 2015 }, { "epoch": 0.5748502994011976, "grad_norm": 1.1146848695110856, "learning_rate": 4.038425047614545e-06, "loss": 0.0295, "step": 2016 }, { "epoch": 0.5751354433989164, "grad_norm": 0.4209229851025855, "learning_rate": 4.033893042558341e-06, "loss": 0.0121, "step": 2017 }, { "epoch": 0.5754205873966353, "grad_norm": 0.48277471651577564, "learning_rate": 4.029361861852171e-06, "loss": 0.0069, "step": 2018 }, { "epoch": 0.5757057313943541, "grad_norm": 1.7222798865166078, "learning_rate": 4.024831509362357e-06, "loss": 0.0298, "step": 2019 }, { "epoch": 0.5759908753920729, "grad_norm": 1.0337635489366164, "learning_rate": 4.020301988954511e-06, "loss": 0.0159, "step": 2020 }, { "epoch": 0.5762760193897919, "grad_norm": 1.3775357400508321, "learning_rate": 4.015773304493536e-06, "loss": 0.0158, "step": 2021 }, { "epoch": 0.5765611633875107, "grad_norm": 0.5083277988878048, "learning_rate": 4.011245459843625e-06, "loss": 0.0067, "step": 2022 }, { "epoch": 0.5768463073852296, "grad_norm": 0.5286132139270061, "learning_rate": 4.006718458868248e-06, "loss": 0.0062, "step": 2023 }, { "epoch": 0.5771314513829484, "grad_norm": 1.1038055454886817, "learning_rate": 4.002192305430162e-06, "loss": 0.0129, "step": 2024 }, { "epoch": 0.5774165953806673, "grad_norm": 1.1714514779633876, "learning_rate": 3.997667003391397e-06, "loss": 0.031, "step": 2025 }, { "epoch": 0.5777017393783861, "grad_norm": 1.7218432058203235, "learning_rate": 3.993142556613255e-06, "loss": 0.0407, "step": 2026 }, { "epoch": 0.5779868833761049, "grad_norm": 1.3615661214766888, "learning_rate": 3.988618968956315e-06, "loss": 0.0442, "step": 2027 }, { "epoch": 0.5782720273738238, "grad_norm": 1.9624857942185958, "learning_rate": 3.9840962442804145e-06, "loss": 0.0478, "step": 2028 }, { "epoch": 0.5785571713715426, "grad_norm": 0.4209016778937654, "learning_rate": 3.97957438644466e-06, "loss": 0.0074, "step": 2029 }, { "epoch": 0.5788423153692615, "grad_norm": 2.0516788767101795, "learning_rate": 3.975053399307416e-06, "loss": 0.0409, "step": 2030 }, { "epoch": 0.5791274593669803, "grad_norm": 0.8532747901837053, "learning_rate": 3.970533286726306e-06, "loss": 0.0157, "step": 2031 }, { "epoch": 0.5794126033646991, "grad_norm": 0.8877673247289528, "learning_rate": 3.966014052558206e-06, "loss": 0.0167, "step": 2032 }, { "epoch": 0.579697747362418, "grad_norm": 1.0452094981847408, "learning_rate": 3.961495700659243e-06, "loss": 0.0177, "step": 2033 }, { "epoch": 0.5799828913601369, "grad_norm": 2.336128746094747, "learning_rate": 3.956978234884791e-06, "loss": 0.0511, "step": 2034 }, { "epoch": 0.5802680353578558, "grad_norm": 1.3293832741262577, "learning_rate": 3.952461659089467e-06, "loss": 0.0206, "step": 2035 }, { "epoch": 0.5805531793555746, "grad_norm": 3.2429992646007246, "learning_rate": 3.947945977127129e-06, "loss": 0.0435, "step": 2036 }, { "epoch": 0.5808383233532934, "grad_norm": 1.4889692622010537, "learning_rate": 3.943431192850874e-06, "loss": 0.0168, "step": 2037 }, { "epoch": 0.5811234673510123, "grad_norm": 0.7779287362969187, "learning_rate": 3.93891731011303e-06, "loss": 0.0081, "step": 2038 }, { "epoch": 0.5814086113487311, "grad_norm": 1.1967880354641325, "learning_rate": 3.934404332765161e-06, "loss": 0.0154, "step": 2039 }, { "epoch": 0.58169375534645, "grad_norm": 1.1772392533629603, "learning_rate": 3.929892264658052e-06, "loss": 0.0269, "step": 2040 }, { "epoch": 0.5819788993441688, "grad_norm": 1.8826797992290683, "learning_rate": 3.925381109641715e-06, "loss": 0.0459, "step": 2041 }, { "epoch": 0.5822640433418876, "grad_norm": 0.8638860916601324, "learning_rate": 3.9208708715653825e-06, "loss": 0.0174, "step": 2042 }, { "epoch": 0.5825491873396065, "grad_norm": 1.0986393666105592, "learning_rate": 3.916361554277508e-06, "loss": 0.019, "step": 2043 }, { "epoch": 0.5828343313373253, "grad_norm": 0.9183125572551752, "learning_rate": 3.911853161625756e-06, "loss": 0.0183, "step": 2044 }, { "epoch": 0.5831194753350442, "grad_norm": 1.9954417238979274, "learning_rate": 3.9073456974570004e-06, "loss": 0.0218, "step": 2045 }, { "epoch": 0.583404619332763, "grad_norm": 0.6191794329765098, "learning_rate": 3.902839165617323e-06, "loss": 0.014, "step": 2046 }, { "epoch": 0.583689763330482, "grad_norm": 1.4231403686313335, "learning_rate": 3.898333569952018e-06, "loss": 0.0296, "step": 2047 }, { "epoch": 0.5839749073282008, "grad_norm": 0.8776537238404536, "learning_rate": 3.89382891430557e-06, "loss": 0.0083, "step": 2048 }, { "epoch": 0.5842600513259196, "grad_norm": 0.8708883202320266, "learning_rate": 3.889325202521668e-06, "loss": 0.0202, "step": 2049 }, { "epoch": 0.5845451953236385, "grad_norm": 0.8831775072235944, "learning_rate": 3.884822438443196e-06, "loss": 0.024, "step": 2050 }, { "epoch": 0.5848303393213573, "grad_norm": 2.2216278743868196, "learning_rate": 3.880320625912224e-06, "loss": 0.0326, "step": 2051 }, { "epoch": 0.5851154833190761, "grad_norm": 0.6026223557249291, "learning_rate": 3.875819768770012e-06, "loss": 0.0197, "step": 2052 }, { "epoch": 0.585400627316795, "grad_norm": 1.4057182511451267, "learning_rate": 3.87131987085701e-06, "loss": 0.028, "step": 2053 }, { "epoch": 0.5856857713145138, "grad_norm": 1.487605532299638, "learning_rate": 3.866820936012844e-06, "loss": 0.0113, "step": 2054 }, { "epoch": 0.5859709153122327, "grad_norm": 1.4068652241281197, "learning_rate": 3.862322968076322e-06, "loss": 0.0553, "step": 2055 }, { "epoch": 0.5862560593099515, "grad_norm": 0.6717247126069004, "learning_rate": 3.857825970885422e-06, "loss": 0.0162, "step": 2056 }, { "epoch": 0.5865412033076703, "grad_norm": 0.2160215600154683, "learning_rate": 3.853329948277297e-06, "loss": 0.0048, "step": 2057 }, { "epoch": 0.5868263473053892, "grad_norm": 0.7554944197765426, "learning_rate": 3.848834904088268e-06, "loss": 0.0157, "step": 2058 }, { "epoch": 0.587111491303108, "grad_norm": 0.893784839410386, "learning_rate": 3.844340842153823e-06, "loss": 0.0186, "step": 2059 }, { "epoch": 0.5873966353008269, "grad_norm": 0.742558705028012, "learning_rate": 3.839847766308607e-06, "loss": 0.0158, "step": 2060 }, { "epoch": 0.5876817792985458, "grad_norm": 1.8275272220434577, "learning_rate": 3.8353556803864324e-06, "loss": 0.0421, "step": 2061 }, { "epoch": 0.5879669232962647, "grad_norm": 0.4661961963077854, "learning_rate": 3.830864588220253e-06, "loss": 0.0086, "step": 2062 }, { "epoch": 0.5882520672939835, "grad_norm": 0.4600167908756378, "learning_rate": 3.826374493642187e-06, "loss": 0.0143, "step": 2063 }, { "epoch": 0.5885372112917023, "grad_norm": 0.9455308154957766, "learning_rate": 3.821885400483497e-06, "loss": 0.021, "step": 2064 }, { "epoch": 0.5888223552894212, "grad_norm": 0.7579095990471869, "learning_rate": 3.817397312574592e-06, "loss": 0.0143, "step": 2065 }, { "epoch": 0.58910749928714, "grad_norm": 1.4968058315352246, "learning_rate": 3.8129102337450236e-06, "loss": 0.0333, "step": 2066 }, { "epoch": 0.5893926432848589, "grad_norm": 1.1681439442987638, "learning_rate": 3.8084241678234777e-06, "loss": 0.0183, "step": 2067 }, { "epoch": 0.5896777872825777, "grad_norm": 1.0976297564753572, "learning_rate": 3.8039391186377806e-06, "loss": 0.0132, "step": 2068 }, { "epoch": 0.5899629312802965, "grad_norm": 1.8400386955643093, "learning_rate": 3.7994550900148914e-06, "loss": 0.0233, "step": 2069 }, { "epoch": 0.5902480752780154, "grad_norm": 0.8935945500146714, "learning_rate": 3.7949720857808953e-06, "loss": 0.0119, "step": 2070 }, { "epoch": 0.5905332192757342, "grad_norm": 1.0682714756646516, "learning_rate": 3.7904901097610074e-06, "loss": 0.0108, "step": 2071 }, { "epoch": 0.590818363273453, "grad_norm": 0.9744085122592351, "learning_rate": 3.786009165779563e-06, "loss": 0.0099, "step": 2072 }, { "epoch": 0.5911035072711719, "grad_norm": 1.3356235518474884, "learning_rate": 3.7815292576600143e-06, "loss": 0.0374, "step": 2073 }, { "epoch": 0.5913886512688908, "grad_norm": 2.052387547798954, "learning_rate": 3.777050389224933e-06, "loss": 0.0582, "step": 2074 }, { "epoch": 0.5916737952666097, "grad_norm": 1.021714477749963, "learning_rate": 3.7725725642960047e-06, "loss": 0.0175, "step": 2075 }, { "epoch": 0.5919589392643285, "grad_norm": 0.6765650754941559, "learning_rate": 3.768095786694021e-06, "loss": 0.0131, "step": 2076 }, { "epoch": 0.5922440832620474, "grad_norm": 0.5186344994748767, "learning_rate": 3.7636200602388855e-06, "loss": 0.0061, "step": 2077 }, { "epoch": 0.5925292272597662, "grad_norm": 0.6496702410485596, "learning_rate": 3.759145388749595e-06, "loss": 0.008, "step": 2078 }, { "epoch": 0.592814371257485, "grad_norm": 1.003400177949479, "learning_rate": 3.7546717760442565e-06, "loss": 0.0215, "step": 2079 }, { "epoch": 0.5930995152552039, "grad_norm": 1.4201821861215709, "learning_rate": 3.7501992259400665e-06, "loss": 0.0193, "step": 2080 }, { "epoch": 0.5933846592529227, "grad_norm": 1.7584435403151972, "learning_rate": 3.7457277422533213e-06, "loss": 0.03, "step": 2081 }, { "epoch": 0.5936698032506416, "grad_norm": 1.1121101189301186, "learning_rate": 3.7412573287994e-06, "loss": 0.0168, "step": 2082 }, { "epoch": 0.5939549472483604, "grad_norm": 0.9801459726894103, "learning_rate": 3.7367879893927763e-06, "loss": 0.0236, "step": 2083 }, { "epoch": 0.5942400912460792, "grad_norm": 0.9674593323408074, "learning_rate": 3.732319727846998e-06, "loss": 0.0179, "step": 2084 }, { "epoch": 0.5945252352437981, "grad_norm": 1.222026333353283, "learning_rate": 3.7278525479747014e-06, "loss": 0.0365, "step": 2085 }, { "epoch": 0.5948103792415169, "grad_norm": 2.025476959542898, "learning_rate": 3.7233864535875953e-06, "loss": 0.0373, "step": 2086 }, { "epoch": 0.5950955232392359, "grad_norm": 1.0776536635630782, "learning_rate": 3.7189214484964663e-06, "loss": 0.0215, "step": 2087 }, { "epoch": 0.5953806672369547, "grad_norm": 1.1705278143136957, "learning_rate": 3.7144575365111677e-06, "loss": 0.0114, "step": 2088 }, { "epoch": 0.5956658112346735, "grad_norm": 2.401216532684097, "learning_rate": 3.7099947214406217e-06, "loss": 0.032, "step": 2089 }, { "epoch": 0.5959509552323924, "grad_norm": 1.6998711218014348, "learning_rate": 3.705533007092812e-06, "loss": 0.0462, "step": 2090 }, { "epoch": 0.5962360992301112, "grad_norm": 1.5985776456990062, "learning_rate": 3.7010723972747887e-06, "loss": 0.045, "step": 2091 }, { "epoch": 0.5965212432278301, "grad_norm": 0.22163354454740036, "learning_rate": 3.6966128957926528e-06, "loss": 0.0035, "step": 2092 }, { "epoch": 0.5968063872255489, "grad_norm": 0.860485640046081, "learning_rate": 3.6921545064515667e-06, "loss": 0.0232, "step": 2093 }, { "epoch": 0.5970915312232677, "grad_norm": 0.19848014052903284, "learning_rate": 3.6876972330557393e-06, "loss": 0.0023, "step": 2094 }, { "epoch": 0.5973766752209866, "grad_norm": 1.194915635682168, "learning_rate": 3.6832410794084255e-06, "loss": 0.0232, "step": 2095 }, { "epoch": 0.5976618192187054, "grad_norm": 1.6969899400898916, "learning_rate": 3.6787860493119274e-06, "loss": 0.0479, "step": 2096 }, { "epoch": 0.5979469632164243, "grad_norm": 0.6961043811511708, "learning_rate": 3.6743321465675918e-06, "loss": 0.0059, "step": 2097 }, { "epoch": 0.5982321072141431, "grad_norm": 1.5504097165767623, "learning_rate": 3.669879374975796e-06, "loss": 0.0533, "step": 2098 }, { "epoch": 0.598517251211862, "grad_norm": 0.6225399533823082, "learning_rate": 3.6654277383359613e-06, "loss": 0.0241, "step": 2099 }, { "epoch": 0.5988023952095808, "grad_norm": 0.8199738615224776, "learning_rate": 3.6609772404465293e-06, "loss": 0.0153, "step": 2100 }, { "epoch": 0.5990875392072997, "grad_norm": 2.6367146702980135, "learning_rate": 3.6565278851049803e-06, "loss": 0.0232, "step": 2101 }, { "epoch": 0.5993726832050186, "grad_norm": 2.218376029082567, "learning_rate": 3.6520796761078126e-06, "loss": 0.0264, "step": 2102 }, { "epoch": 0.5996578272027374, "grad_norm": 0.42063773687844663, "learning_rate": 3.6476326172505516e-06, "loss": 0.0077, "step": 2103 }, { "epoch": 0.5999429712004563, "grad_norm": 1.4846079891375457, "learning_rate": 3.6431867123277393e-06, "loss": 0.0274, "step": 2104 }, { "epoch": 0.6002281151981751, "grad_norm": 2.0672737709470628, "learning_rate": 3.6387419651329326e-06, "loss": 0.0252, "step": 2105 }, { "epoch": 0.6005132591958939, "grad_norm": 1.3500653562887743, "learning_rate": 3.6342983794586974e-06, "loss": 0.0152, "step": 2106 }, { "epoch": 0.6007984031936128, "grad_norm": 1.7611157268739988, "learning_rate": 3.6298559590966153e-06, "loss": 0.0286, "step": 2107 }, { "epoch": 0.6010835471913316, "grad_norm": 1.2728775668088093, "learning_rate": 3.625414707837268e-06, "loss": 0.0179, "step": 2108 }, { "epoch": 0.6013686911890505, "grad_norm": 0.9960972880229151, "learning_rate": 3.6209746294702442e-06, "loss": 0.0232, "step": 2109 }, { "epoch": 0.6016538351867693, "grad_norm": 0.6157173200508199, "learning_rate": 3.6165357277841294e-06, "loss": 0.0134, "step": 2110 }, { "epoch": 0.6019389791844881, "grad_norm": 0.8400067311463636, "learning_rate": 3.6120980065665023e-06, "loss": 0.0112, "step": 2111 }, { "epoch": 0.602224123182207, "grad_norm": 1.2339712983273081, "learning_rate": 3.607661469603937e-06, "loss": 0.0239, "step": 2112 }, { "epoch": 0.6025092671799258, "grad_norm": 1.5580285145203188, "learning_rate": 3.6032261206819995e-06, "loss": 0.0412, "step": 2113 }, { "epoch": 0.6027944111776448, "grad_norm": 1.1722662736059852, "learning_rate": 3.598791963585239e-06, "loss": 0.0152, "step": 2114 }, { "epoch": 0.6030795551753636, "grad_norm": 1.2912381233776336, "learning_rate": 3.5943590020971873e-06, "loss": 0.0272, "step": 2115 }, { "epoch": 0.6033646991730824, "grad_norm": 2.1137858949289448, "learning_rate": 3.5899272400003603e-06, "loss": 0.0288, "step": 2116 }, { "epoch": 0.6036498431708013, "grad_norm": 0.40725160937380805, "learning_rate": 3.585496681076244e-06, "loss": 0.005, "step": 2117 }, { "epoch": 0.6039349871685201, "grad_norm": 1.069151927238274, "learning_rate": 3.581067329105301e-06, "loss": 0.0286, "step": 2118 }, { "epoch": 0.604220131166239, "grad_norm": 1.6223267876797507, "learning_rate": 3.5766391878669676e-06, "loss": 0.028, "step": 2119 }, { "epoch": 0.6045052751639578, "grad_norm": 0.9274658583181319, "learning_rate": 3.5722122611396416e-06, "loss": 0.0124, "step": 2120 }, { "epoch": 0.6047904191616766, "grad_norm": 1.280180345583819, "learning_rate": 3.5677865527006876e-06, "loss": 0.0377, "step": 2121 }, { "epoch": 0.6050755631593955, "grad_norm": 1.4904942621790036, "learning_rate": 3.563362066326427e-06, "loss": 0.0263, "step": 2122 }, { "epoch": 0.6053607071571143, "grad_norm": 1.2524362229042842, "learning_rate": 3.5589388057921435e-06, "loss": 0.0267, "step": 2123 }, { "epoch": 0.6056458511548332, "grad_norm": 4.061187413865505, "learning_rate": 3.5545167748720705e-06, "loss": 0.0713, "step": 2124 }, { "epoch": 0.605930995152552, "grad_norm": 0.5362589492288503, "learning_rate": 3.550095977339396e-06, "loss": 0.0094, "step": 2125 }, { "epoch": 0.6062161391502708, "grad_norm": 0.8432265034751789, "learning_rate": 3.545676416966254e-06, "loss": 0.0184, "step": 2126 }, { "epoch": 0.6065012831479898, "grad_norm": 1.3167325005559367, "learning_rate": 3.541258097523722e-06, "loss": 0.0289, "step": 2127 }, { "epoch": 0.6067864271457086, "grad_norm": 1.0810373994278544, "learning_rate": 3.536841022781816e-06, "loss": 0.019, "step": 2128 }, { "epoch": 0.6070715711434275, "grad_norm": 1.2837062613842711, "learning_rate": 3.532425196509498e-06, "loss": 0.0268, "step": 2129 }, { "epoch": 0.6073567151411463, "grad_norm": 1.1490746860147354, "learning_rate": 3.5280106224746575e-06, "loss": 0.0251, "step": 2130 }, { "epoch": 0.6076418591388651, "grad_norm": 1.0489665093288734, "learning_rate": 3.5235973044441163e-06, "loss": 0.018, "step": 2131 }, { "epoch": 0.607927003136584, "grad_norm": 1.726137132954225, "learning_rate": 3.5191852461836306e-06, "loss": 0.0241, "step": 2132 }, { "epoch": 0.6082121471343028, "grad_norm": 0.7090919389506171, "learning_rate": 3.514774451457873e-06, "loss": 0.0186, "step": 2133 }, { "epoch": 0.6084972911320217, "grad_norm": 0.4590722376042221, "learning_rate": 3.510364924030443e-06, "loss": 0.0097, "step": 2134 }, { "epoch": 0.6087824351297405, "grad_norm": 1.1018085553098087, "learning_rate": 3.505956667663859e-06, "loss": 0.015, "step": 2135 }, { "epoch": 0.6090675791274593, "grad_norm": 0.6254529574769264, "learning_rate": 3.5015496861195526e-06, "loss": 0.0135, "step": 2136 }, { "epoch": 0.6093527231251782, "grad_norm": 1.6428276895681884, "learning_rate": 3.497143983157868e-06, "loss": 0.0266, "step": 2137 }, { "epoch": 0.609637867122897, "grad_norm": 1.1906175686284322, "learning_rate": 3.4927395625380626e-06, "loss": 0.0403, "step": 2138 }, { "epoch": 0.6099230111206159, "grad_norm": 0.7327641862389733, "learning_rate": 3.488336428018293e-06, "loss": 0.0166, "step": 2139 }, { "epoch": 0.6102081551183347, "grad_norm": 1.8597660390527846, "learning_rate": 3.4839345833556217e-06, "loss": 0.0324, "step": 2140 }, { "epoch": 0.6104932991160537, "grad_norm": 3.452549046142695, "learning_rate": 3.479534032306011e-06, "loss": 0.0538, "step": 2141 }, { "epoch": 0.6107784431137725, "grad_norm": 1.2114221044219349, "learning_rate": 3.4751347786243193e-06, "loss": 0.0152, "step": 2142 }, { "epoch": 0.6110635871114913, "grad_norm": 1.2721385499066669, "learning_rate": 3.470736826064299e-06, "loss": 0.0175, "step": 2143 }, { "epoch": 0.6113487311092102, "grad_norm": 1.6293019383768632, "learning_rate": 3.4663401783785865e-06, "loss": 0.0483, "step": 2144 }, { "epoch": 0.611633875106929, "grad_norm": 1.445639177748986, "learning_rate": 3.4619448393187126e-06, "loss": 0.0155, "step": 2145 }, { "epoch": 0.6119190191046479, "grad_norm": 1.0315630583680009, "learning_rate": 3.4575508126350875e-06, "loss": 0.0126, "step": 2146 }, { "epoch": 0.6122041631023667, "grad_norm": 0.7398009312525474, "learning_rate": 3.453158102077001e-06, "loss": 0.0229, "step": 2147 }, { "epoch": 0.6124893071000855, "grad_norm": 0.3557371653456857, "learning_rate": 3.4487667113926226e-06, "loss": 0.0115, "step": 2148 }, { "epoch": 0.6127744510978044, "grad_norm": 1.2762208256712575, "learning_rate": 3.4443766443289948e-06, "loss": 0.0246, "step": 2149 }, { "epoch": 0.6130595950955232, "grad_norm": 1.3646703557027133, "learning_rate": 3.439987904632026e-06, "loss": 0.0311, "step": 2150 }, { "epoch": 0.613344739093242, "grad_norm": 1.2229542617252571, "learning_rate": 3.4356004960464994e-06, "loss": 0.0227, "step": 2151 }, { "epoch": 0.6136298830909609, "grad_norm": 2.3224264173489786, "learning_rate": 3.431214422316057e-06, "loss": 0.0362, "step": 2152 }, { "epoch": 0.6139150270886797, "grad_norm": 0.709410518708107, "learning_rate": 3.426829687183204e-06, "loss": 0.0114, "step": 2153 }, { "epoch": 0.6142001710863987, "grad_norm": 1.0413949214912819, "learning_rate": 3.4224462943893057e-06, "loss": 0.0236, "step": 2154 }, { "epoch": 0.6144853150841175, "grad_norm": 0.7296747342861363, "learning_rate": 3.418064247674576e-06, "loss": 0.0112, "step": 2155 }, { "epoch": 0.6147704590818364, "grad_norm": 1.2435332019082928, "learning_rate": 3.413683550778084e-06, "loss": 0.0158, "step": 2156 }, { "epoch": 0.6150556030795552, "grad_norm": 1.0875315186112393, "learning_rate": 3.409304207437749e-06, "loss": 0.0194, "step": 2157 }, { "epoch": 0.615340747077274, "grad_norm": 1.5282693808083203, "learning_rate": 3.404926221390332e-06, "loss": 0.0218, "step": 2158 }, { "epoch": 0.6156258910749929, "grad_norm": 0.474632274589884, "learning_rate": 3.400549596371435e-06, "loss": 0.0117, "step": 2159 }, { "epoch": 0.6159110350727117, "grad_norm": 0.7581340345773641, "learning_rate": 3.3961743361155055e-06, "loss": 0.0197, "step": 2160 }, { "epoch": 0.6161961790704306, "grad_norm": 1.3727372919022707, "learning_rate": 3.3918004443558163e-06, "loss": 0.0563, "step": 2161 }, { "epoch": 0.6164813230681494, "grad_norm": 2.2648471754423496, "learning_rate": 3.3874279248244803e-06, "loss": 0.0327, "step": 2162 }, { "epoch": 0.6167664670658682, "grad_norm": 0.7505272987405108, "learning_rate": 3.383056781252435e-06, "loss": 0.013, "step": 2163 }, { "epoch": 0.6170516110635871, "grad_norm": 1.1079848809501045, "learning_rate": 3.3786870173694497e-06, "loss": 0.0127, "step": 2164 }, { "epoch": 0.6173367550613059, "grad_norm": 2.0002129754495295, "learning_rate": 3.37431863690411e-06, "loss": 0.0347, "step": 2165 }, { "epoch": 0.6176218990590248, "grad_norm": 1.5327965290014722, "learning_rate": 3.369951643583823e-06, "loss": 0.0307, "step": 2166 }, { "epoch": 0.6179070430567437, "grad_norm": 2.027574254434053, "learning_rate": 3.365586041134815e-06, "loss": 0.0292, "step": 2167 }, { "epoch": 0.6181921870544625, "grad_norm": 1.1355125852210322, "learning_rate": 3.361221833282122e-06, "loss": 0.0182, "step": 2168 }, { "epoch": 0.6184773310521814, "grad_norm": 1.0205244715021804, "learning_rate": 3.3568590237495912e-06, "loss": 0.0339, "step": 2169 }, { "epoch": 0.6187624750499002, "grad_norm": 0.9286220091766371, "learning_rate": 3.3524976162598777e-06, "loss": 0.0223, "step": 2170 }, { "epoch": 0.6190476190476191, "grad_norm": 0.6108615934396132, "learning_rate": 3.34813761453444e-06, "loss": 0.0103, "step": 2171 }, { "epoch": 0.6193327630453379, "grad_norm": 1.2988343838308507, "learning_rate": 3.343779022293536e-06, "loss": 0.0301, "step": 2172 }, { "epoch": 0.6196179070430567, "grad_norm": 0.3341719225290523, "learning_rate": 3.3394218432562185e-06, "loss": 0.0082, "step": 2173 }, { "epoch": 0.6199030510407756, "grad_norm": 1.6943619739632687, "learning_rate": 3.3350660811403425e-06, "loss": 0.0294, "step": 2174 }, { "epoch": 0.6201881950384944, "grad_norm": 1.1528385894622213, "learning_rate": 3.330711739662545e-06, "loss": 0.0187, "step": 2175 }, { "epoch": 0.6204733390362133, "grad_norm": 1.4611973629167543, "learning_rate": 3.326358822538258e-06, "loss": 0.0402, "step": 2176 }, { "epoch": 0.6207584830339321, "grad_norm": 1.7876364426505764, "learning_rate": 3.3220073334816928e-06, "loss": 0.0326, "step": 2177 }, { "epoch": 0.621043627031651, "grad_norm": 0.7956538149927447, "learning_rate": 3.3176572762058434e-06, "loss": 0.0146, "step": 2178 }, { "epoch": 0.6213287710293698, "grad_norm": 1.3226414882028097, "learning_rate": 3.313308654422484e-06, "loss": 0.0285, "step": 2179 }, { "epoch": 0.6216139150270886, "grad_norm": 2.1614175148958616, "learning_rate": 3.3089614718421635e-06, "loss": 0.0377, "step": 2180 }, { "epoch": 0.6218990590248076, "grad_norm": 0.9471207974194648, "learning_rate": 3.304615732174201e-06, "loss": 0.0145, "step": 2181 }, { "epoch": 0.6221842030225264, "grad_norm": 1.3292435098511863, "learning_rate": 3.300271439126689e-06, "loss": 0.0234, "step": 2182 }, { "epoch": 0.6224693470202453, "grad_norm": 0.706611462258107, "learning_rate": 3.2959285964064776e-06, "loss": 0.0147, "step": 2183 }, { "epoch": 0.6227544910179641, "grad_norm": 3.000133668041504, "learning_rate": 3.291587207719186e-06, "loss": 0.0514, "step": 2184 }, { "epoch": 0.6230396350156829, "grad_norm": 2.2778332807998405, "learning_rate": 3.2872472767691894e-06, "loss": 0.0488, "step": 2185 }, { "epoch": 0.6233247790134018, "grad_norm": 1.1901143222547017, "learning_rate": 3.2829088072596226e-06, "loss": 0.0263, "step": 2186 }, { "epoch": 0.6236099230111206, "grad_norm": 0.8397072051957538, "learning_rate": 3.2785718028923715e-06, "loss": 0.0264, "step": 2187 }, { "epoch": 0.6238950670088395, "grad_norm": 0.38604583860305863, "learning_rate": 3.2742362673680687e-06, "loss": 0.0074, "step": 2188 }, { "epoch": 0.6241802110065583, "grad_norm": 1.947982342419954, "learning_rate": 3.2699022043860973e-06, "loss": 0.0267, "step": 2189 }, { "epoch": 0.6244653550042771, "grad_norm": 1.1626686035488216, "learning_rate": 3.2655696176445852e-06, "loss": 0.0229, "step": 2190 }, { "epoch": 0.624750499001996, "grad_norm": 1.6043946457619445, "learning_rate": 3.2612385108403955e-06, "loss": 0.0382, "step": 2191 }, { "epoch": 0.6250356429997148, "grad_norm": 1.8055044074507538, "learning_rate": 3.256908887669134e-06, "loss": 0.0276, "step": 2192 }, { "epoch": 0.6253207869974337, "grad_norm": 1.6754046006759364, "learning_rate": 3.2525807518251386e-06, "loss": 0.0408, "step": 2193 }, { "epoch": 0.6256059309951526, "grad_norm": 0.9463034307019634, "learning_rate": 3.248254107001474e-06, "loss": 0.0175, "step": 2194 }, { "epoch": 0.6258910749928714, "grad_norm": 2.175470555795851, "learning_rate": 3.243928956889938e-06, "loss": 0.0278, "step": 2195 }, { "epoch": 0.6261762189905903, "grad_norm": 0.6729073385440583, "learning_rate": 3.2396053051810515e-06, "loss": 0.0285, "step": 2196 }, { "epoch": 0.6264613629883091, "grad_norm": 1.571784137465903, "learning_rate": 3.2352831555640563e-06, "loss": 0.0309, "step": 2197 }, { "epoch": 0.626746506986028, "grad_norm": 1.1661769802553186, "learning_rate": 3.230962511726915e-06, "loss": 0.025, "step": 2198 }, { "epoch": 0.6270316509837468, "grad_norm": 0.42123213663257564, "learning_rate": 3.2266433773563e-06, "loss": 0.0115, "step": 2199 }, { "epoch": 0.6273167949814656, "grad_norm": 1.8338109807568292, "learning_rate": 3.222325756137599e-06, "loss": 0.0367, "step": 2200 }, { "epoch": 0.6276019389791845, "grad_norm": 1.1277887251821224, "learning_rate": 3.21800965175491e-06, "loss": 0.0311, "step": 2201 }, { "epoch": 0.6278870829769033, "grad_norm": 1.201306830776855, "learning_rate": 3.213695067891034e-06, "loss": 0.022, "step": 2202 }, { "epoch": 0.6281722269746222, "grad_norm": 1.2570922828383633, "learning_rate": 3.2093820082274763e-06, "loss": 0.0193, "step": 2203 }, { "epoch": 0.628457370972341, "grad_norm": 1.5186675511953927, "learning_rate": 3.2050704764444433e-06, "loss": 0.0441, "step": 2204 }, { "epoch": 0.6287425149700598, "grad_norm": 0.6524174178550722, "learning_rate": 3.200760476220831e-06, "loss": 0.0268, "step": 2205 }, { "epoch": 0.6290276589677787, "grad_norm": 0.8724476437379903, "learning_rate": 3.1964520112342363e-06, "loss": 0.0223, "step": 2206 }, { "epoch": 0.6293128029654976, "grad_norm": 1.4186863998600603, "learning_rate": 3.1921450851609404e-06, "loss": 0.0392, "step": 2207 }, { "epoch": 0.6295979469632165, "grad_norm": 1.336564669853662, "learning_rate": 3.187839701675917e-06, "loss": 0.0222, "step": 2208 }, { "epoch": 0.6298830909609353, "grad_norm": 1.6468050869178028, "learning_rate": 3.18353586445282e-06, "loss": 0.037, "step": 2209 }, { "epoch": 0.6301682349586541, "grad_norm": 1.841124429536788, "learning_rate": 3.1792335771639827e-06, "loss": 0.0394, "step": 2210 }, { "epoch": 0.630453378956373, "grad_norm": 1.5844988548472232, "learning_rate": 3.174932843480416e-06, "loss": 0.0397, "step": 2211 }, { "epoch": 0.6307385229540918, "grad_norm": 2.0069101566372085, "learning_rate": 3.1706336670718106e-06, "loss": 0.0501, "step": 2212 }, { "epoch": 0.6310236669518107, "grad_norm": 0.9427410695473213, "learning_rate": 3.166336051606521e-06, "loss": 0.0231, "step": 2213 }, { "epoch": 0.6313088109495295, "grad_norm": 0.9212663531499328, "learning_rate": 3.1620400007515772e-06, "loss": 0.0114, "step": 2214 }, { "epoch": 0.6315939549472483, "grad_norm": 0.8808245598066381, "learning_rate": 3.157745518172669e-06, "loss": 0.0078, "step": 2215 }, { "epoch": 0.6318790989449672, "grad_norm": 1.0702460227921857, "learning_rate": 3.153452607534147e-06, "loss": 0.02, "step": 2216 }, { "epoch": 0.632164242942686, "grad_norm": 1.2534625169103417, "learning_rate": 3.149161272499024e-06, "loss": 0.0374, "step": 2217 }, { "epoch": 0.6324493869404049, "grad_norm": 1.5261888629053173, "learning_rate": 3.1448715167289677e-06, "loss": 0.0435, "step": 2218 }, { "epoch": 0.6327345309381237, "grad_norm": 1.2002510268158786, "learning_rate": 3.140583343884298e-06, "loss": 0.0183, "step": 2219 }, { "epoch": 0.6330196749358425, "grad_norm": 1.0985692409600272, "learning_rate": 3.1362967576239854e-06, "loss": 0.0262, "step": 2220 }, { "epoch": 0.6333048189335615, "grad_norm": 1.2072207242963418, "learning_rate": 3.1320117616056413e-06, "loss": 0.0314, "step": 2221 }, { "epoch": 0.6335899629312803, "grad_norm": 0.467015398806509, "learning_rate": 3.1277283594855267e-06, "loss": 0.0106, "step": 2222 }, { "epoch": 0.6338751069289992, "grad_norm": 0.8807024802610426, "learning_rate": 3.123446554918538e-06, "loss": 0.011, "step": 2223 }, { "epoch": 0.634160250926718, "grad_norm": 0.6140299365612738, "learning_rate": 3.1191663515582127e-06, "loss": 0.0248, "step": 2224 }, { "epoch": 0.6344453949244369, "grad_norm": 1.0559654324827494, "learning_rate": 3.1148877530567177e-06, "loss": 0.0183, "step": 2225 }, { "epoch": 0.6347305389221557, "grad_norm": 1.0540351749712773, "learning_rate": 3.1106107630648574e-06, "loss": 0.0261, "step": 2226 }, { "epoch": 0.6350156829198745, "grad_norm": 0.9313773695645087, "learning_rate": 3.106335385232051e-06, "loss": 0.0292, "step": 2227 }, { "epoch": 0.6353008269175934, "grad_norm": 0.3747808238118342, "learning_rate": 3.102061623206355e-06, "loss": 0.0094, "step": 2228 }, { "epoch": 0.6355859709153122, "grad_norm": 2.3312916926506504, "learning_rate": 3.0977894806344406e-06, "loss": 0.0527, "step": 2229 }, { "epoch": 0.635871114913031, "grad_norm": 0.6324565323611943, "learning_rate": 3.093518961161599e-06, "loss": 0.0095, "step": 2230 }, { "epoch": 0.6361562589107499, "grad_norm": 1.590518034839, "learning_rate": 3.0892500684317386e-06, "loss": 0.0244, "step": 2231 }, { "epoch": 0.6364414029084687, "grad_norm": 0.7308543148733039, "learning_rate": 3.084982806087372e-06, "loss": 0.024, "step": 2232 }, { "epoch": 0.6367265469061876, "grad_norm": 1.248369277983139, "learning_rate": 3.080717177769629e-06, "loss": 0.0231, "step": 2233 }, { "epoch": 0.6370116909039065, "grad_norm": 0.9686441748388779, "learning_rate": 3.0764531871182422e-06, "loss": 0.016, "step": 2234 }, { "epoch": 0.6372968349016254, "grad_norm": 1.0082773282212865, "learning_rate": 3.072190837771546e-06, "loss": 0.0175, "step": 2235 }, { "epoch": 0.6375819788993442, "grad_norm": 1.4311529725243861, "learning_rate": 3.067930133366476e-06, "loss": 0.0319, "step": 2236 }, { "epoch": 0.637867122897063, "grad_norm": 0.8406777760925859, "learning_rate": 3.0636710775385635e-06, "loss": 0.0108, "step": 2237 }, { "epoch": 0.6381522668947819, "grad_norm": 0.878282487364186, "learning_rate": 3.059413673921931e-06, "loss": 0.0193, "step": 2238 }, { "epoch": 0.6384374108925007, "grad_norm": 0.9502594240301524, "learning_rate": 3.055157926149293e-06, "loss": 0.0203, "step": 2239 }, { "epoch": 0.6387225548902196, "grad_norm": 1.308186958668311, "learning_rate": 3.050903837851953e-06, "loss": 0.023, "step": 2240 }, { "epoch": 0.6390076988879384, "grad_norm": 0.44428606044631475, "learning_rate": 3.0466514126597945e-06, "loss": 0.014, "step": 2241 }, { "epoch": 0.6392928428856572, "grad_norm": 1.209699851720728, "learning_rate": 3.0424006542012897e-06, "loss": 0.0103, "step": 2242 }, { "epoch": 0.6395779868833761, "grad_norm": 0.9514393878467423, "learning_rate": 3.038151566103475e-06, "loss": 0.0147, "step": 2243 }, { "epoch": 0.6398631308810949, "grad_norm": 1.9454407250073225, "learning_rate": 3.0339041519919745e-06, "loss": 0.042, "step": 2244 }, { "epoch": 0.6401482748788138, "grad_norm": 0.728850597077105, "learning_rate": 3.029658415490977e-06, "loss": 0.0124, "step": 2245 }, { "epoch": 0.6404334188765326, "grad_norm": 1.1288659293589929, "learning_rate": 3.0254143602232434e-06, "loss": 0.0197, "step": 2246 }, { "epoch": 0.6407185628742516, "grad_norm": 1.1475249422099956, "learning_rate": 3.021171989810099e-06, "loss": 0.0211, "step": 2247 }, { "epoch": 0.6410037068719704, "grad_norm": 1.2132830355724133, "learning_rate": 3.0169313078714296e-06, "loss": 0.0293, "step": 2248 }, { "epoch": 0.6412888508696892, "grad_norm": 1.9363856482086057, "learning_rate": 3.0126923180256806e-06, "loss": 0.0243, "step": 2249 }, { "epoch": 0.6415739948674081, "grad_norm": 0.7703844388099039, "learning_rate": 3.008455023889857e-06, "loss": 0.0121, "step": 2250 }, { "epoch": 0.6418591388651269, "grad_norm": 1.3134506657893854, "learning_rate": 3.0042194290795123e-06, "loss": 0.0318, "step": 2251 }, { "epoch": 0.6421442828628457, "grad_norm": 1.4692545047359329, "learning_rate": 2.999985537208755e-06, "loss": 0.0176, "step": 2252 }, { "epoch": 0.6424294268605646, "grad_norm": 0.9828572979175608, "learning_rate": 2.9957533518902376e-06, "loss": 0.0196, "step": 2253 }, { "epoch": 0.6427145708582834, "grad_norm": 0.7049658553143684, "learning_rate": 2.991522876735154e-06, "loss": 0.0229, "step": 2254 }, { "epoch": 0.6429997148560023, "grad_norm": 0.842198069759342, "learning_rate": 2.987294115353242e-06, "loss": 0.0128, "step": 2255 }, { "epoch": 0.6432848588537211, "grad_norm": 1.7227804347359337, "learning_rate": 2.9830670713527786e-06, "loss": 0.0335, "step": 2256 }, { "epoch": 0.64357000285144, "grad_norm": 0.3681229418460075, "learning_rate": 2.9788417483405716e-06, "loss": 0.0083, "step": 2257 }, { "epoch": 0.6438551468491588, "grad_norm": 1.1818461840946088, "learning_rate": 2.9746181499219627e-06, "loss": 0.0154, "step": 2258 }, { "epoch": 0.6441402908468776, "grad_norm": 1.411740982709478, "learning_rate": 2.970396279700824e-06, "loss": 0.0184, "step": 2259 }, { "epoch": 0.6444254348445965, "grad_norm": 2.5844309546840716, "learning_rate": 2.9661761412795465e-06, "loss": 0.0378, "step": 2260 }, { "epoch": 0.6447105788423154, "grad_norm": 1.5612725743924631, "learning_rate": 2.9619577382590485e-06, "loss": 0.0203, "step": 2261 }, { "epoch": 0.6449957228400343, "grad_norm": 1.244411728256585, "learning_rate": 2.9577410742387686e-06, "loss": 0.0179, "step": 2262 }, { "epoch": 0.6452808668377531, "grad_norm": 1.920069290541119, "learning_rate": 2.9535261528166577e-06, "loss": 0.0264, "step": 2263 }, { "epoch": 0.6455660108354719, "grad_norm": 0.9562289133021995, "learning_rate": 2.949312977589181e-06, "loss": 0.0177, "step": 2264 }, { "epoch": 0.6458511548331908, "grad_norm": 2.534580020676373, "learning_rate": 2.945101552151317e-06, "loss": 0.0305, "step": 2265 }, { "epoch": 0.6461362988309096, "grad_norm": 1.3341099226810462, "learning_rate": 2.9408918800965464e-06, "loss": 0.0158, "step": 2266 }, { "epoch": 0.6464214428286285, "grad_norm": 1.9949800798700283, "learning_rate": 2.936683965016855e-06, "loss": 0.0403, "step": 2267 }, { "epoch": 0.6467065868263473, "grad_norm": 0.9844526919586044, "learning_rate": 2.9324778105027323e-06, "loss": 0.0137, "step": 2268 }, { "epoch": 0.6469917308240661, "grad_norm": 1.0381005934303287, "learning_rate": 2.9282734201431627e-06, "loss": 0.0285, "step": 2269 }, { "epoch": 0.647276874821785, "grad_norm": 1.2860946237813613, "learning_rate": 2.924070797525628e-06, "loss": 0.025, "step": 2270 }, { "epoch": 0.6475620188195038, "grad_norm": 1.3340813057644438, "learning_rate": 2.919869946236096e-06, "loss": 0.0278, "step": 2271 }, { "epoch": 0.6478471628172227, "grad_norm": 1.6573747957672138, "learning_rate": 2.9156708698590273e-06, "loss": 0.0301, "step": 2272 }, { "epoch": 0.6481323068149415, "grad_norm": 0.868842113331832, "learning_rate": 2.9114735719773718e-06, "loss": 0.0206, "step": 2273 }, { "epoch": 0.6484174508126604, "grad_norm": 0.7243820586100654, "learning_rate": 2.9072780561725543e-06, "loss": 0.0167, "step": 2274 }, { "epoch": 0.6487025948103793, "grad_norm": 1.185198168923413, "learning_rate": 2.9030843260244834e-06, "loss": 0.023, "step": 2275 }, { "epoch": 0.6489877388080981, "grad_norm": 0.6293907385655024, "learning_rate": 2.8988923851115425e-06, "loss": 0.0099, "step": 2276 }, { "epoch": 0.649272882805817, "grad_norm": 2.0445266570057714, "learning_rate": 2.894702237010589e-06, "loss": 0.0186, "step": 2277 }, { "epoch": 0.6495580268035358, "grad_norm": 2.047384698677883, "learning_rate": 2.8905138852969507e-06, "loss": 0.0268, "step": 2278 }, { "epoch": 0.6498431708012546, "grad_norm": 1.0653517095398133, "learning_rate": 2.886327333544421e-06, "loss": 0.0161, "step": 2279 }, { "epoch": 0.6501283147989735, "grad_norm": 1.5006183561890554, "learning_rate": 2.8821425853252603e-06, "loss": 0.035, "step": 2280 }, { "epoch": 0.6504134587966923, "grad_norm": 0.937416176228414, "learning_rate": 2.8779596442101878e-06, "loss": 0.0229, "step": 2281 }, { "epoch": 0.6506986027944112, "grad_norm": 1.4517724372844274, "learning_rate": 2.8737785137683815e-06, "loss": 0.027, "step": 2282 }, { "epoch": 0.65098374679213, "grad_norm": 1.7278224774601127, "learning_rate": 2.8695991975674735e-06, "loss": 0.0237, "step": 2283 }, { "epoch": 0.6512688907898488, "grad_norm": 1.5785313959183065, "learning_rate": 2.8654216991735504e-06, "loss": 0.0296, "step": 2284 }, { "epoch": 0.6515540347875677, "grad_norm": 1.0919831115191292, "learning_rate": 2.861246022151143e-06, "loss": 0.02, "step": 2285 }, { "epoch": 0.6518391787852865, "grad_norm": 0.7275158367500263, "learning_rate": 2.8570721700632354e-06, "loss": 0.0116, "step": 2286 }, { "epoch": 0.6521243227830055, "grad_norm": 1.3129785911100587, "learning_rate": 2.852900146471249e-06, "loss": 0.0154, "step": 2287 }, { "epoch": 0.6524094667807243, "grad_norm": 1.6226425506929618, "learning_rate": 2.848729954935042e-06, "loss": 0.019, "step": 2288 }, { "epoch": 0.6526946107784432, "grad_norm": 1.7305587988312543, "learning_rate": 2.844561599012918e-06, "loss": 0.0267, "step": 2289 }, { "epoch": 0.652979754776162, "grad_norm": 0.8143957160470554, "learning_rate": 2.8403950822616088e-06, "loss": 0.0085, "step": 2290 }, { "epoch": 0.6532648987738808, "grad_norm": 1.1122531151983972, "learning_rate": 2.836230408236278e-06, "loss": 0.0215, "step": 2291 }, { "epoch": 0.6535500427715997, "grad_norm": 0.4083282299767834, "learning_rate": 2.832067580490516e-06, "loss": 0.0063, "step": 2292 }, { "epoch": 0.6538351867693185, "grad_norm": 0.5640507716536349, "learning_rate": 2.827906602576339e-06, "loss": 0.0087, "step": 2293 }, { "epoch": 0.6541203307670374, "grad_norm": 1.3067961879293044, "learning_rate": 2.823747478044185e-06, "loss": 0.0138, "step": 2294 }, { "epoch": 0.6544054747647562, "grad_norm": 1.0110478159071878, "learning_rate": 2.8195902104429084e-06, "loss": 0.0298, "step": 2295 }, { "epoch": 0.654690618762475, "grad_norm": 0.5904570142107487, "learning_rate": 2.815434803319783e-06, "loss": 0.0111, "step": 2296 }, { "epoch": 0.6549757627601939, "grad_norm": 0.245921457034098, "learning_rate": 2.8112812602204885e-06, "loss": 0.0026, "step": 2297 }, { "epoch": 0.6552609067579127, "grad_norm": 0.7085877553818561, "learning_rate": 2.8071295846891256e-06, "loss": 0.0093, "step": 2298 }, { "epoch": 0.6555460507556315, "grad_norm": 1.6452998075190501, "learning_rate": 2.802979780268188e-06, "loss": 0.0244, "step": 2299 }, { "epoch": 0.6558311947533504, "grad_norm": 0.6904886542892218, "learning_rate": 2.7988318504985817e-06, "loss": 0.0174, "step": 2300 }, { "epoch": 0.6561163387510693, "grad_norm": 0.5850968737884185, "learning_rate": 2.7946857989196076e-06, "loss": 0.0069, "step": 2301 }, { "epoch": 0.6564014827487882, "grad_norm": 1.8185293469917627, "learning_rate": 2.7905416290689717e-06, "loss": 0.0412, "step": 2302 }, { "epoch": 0.656686626746507, "grad_norm": 0.8965190611007764, "learning_rate": 2.7863993444827697e-06, "loss": 0.0226, "step": 2303 }, { "epoch": 0.6569717707442259, "grad_norm": 1.3182053225055796, "learning_rate": 2.782258948695481e-06, "loss": 0.0175, "step": 2304 }, { "epoch": 0.6572569147419447, "grad_norm": 0.9339402915101415, "learning_rate": 2.778120445239989e-06, "loss": 0.0122, "step": 2305 }, { "epoch": 0.6575420587396635, "grad_norm": 1.6156977930324048, "learning_rate": 2.773983837647551e-06, "loss": 0.0389, "step": 2306 }, { "epoch": 0.6578272027373824, "grad_norm": 0.6651074350777881, "learning_rate": 2.76984912944781e-06, "loss": 0.0146, "step": 2307 }, { "epoch": 0.6581123467351012, "grad_norm": 1.8452127520754087, "learning_rate": 2.765716324168789e-06, "loss": 0.0227, "step": 2308 }, { "epoch": 0.6583974907328201, "grad_norm": 0.9652739173619617, "learning_rate": 2.761585425336886e-06, "loss": 0.0267, "step": 2309 }, { "epoch": 0.6586826347305389, "grad_norm": 1.2330026270001035, "learning_rate": 2.757456436476873e-06, "loss": 0.023, "step": 2310 }, { "epoch": 0.6589677787282577, "grad_norm": 0.9565278016472557, "learning_rate": 2.7533293611118923e-06, "loss": 0.0131, "step": 2311 }, { "epoch": 0.6592529227259766, "grad_norm": 0.5029893671434901, "learning_rate": 2.7492042027634525e-06, "loss": 0.0074, "step": 2312 }, { "epoch": 0.6595380667236954, "grad_norm": 1.6048936560011449, "learning_rate": 2.7450809649514265e-06, "loss": 0.048, "step": 2313 }, { "epoch": 0.6598232107214144, "grad_norm": 2.4275543233286503, "learning_rate": 2.740959651194054e-06, "loss": 0.0222, "step": 2314 }, { "epoch": 0.6601083547191332, "grad_norm": 2.8531321009807247, "learning_rate": 2.7368402650079228e-06, "loss": 0.0597, "step": 2315 }, { "epoch": 0.660393498716852, "grad_norm": 1.4630738141417063, "learning_rate": 2.7327228099079826e-06, "loss": 0.0229, "step": 2316 }, { "epoch": 0.6606786427145709, "grad_norm": 0.6779278399548759, "learning_rate": 2.728607289407534e-06, "loss": 0.0069, "step": 2317 }, { "epoch": 0.6609637867122897, "grad_norm": 2.398616876487329, "learning_rate": 2.7244937070182286e-06, "loss": 0.0297, "step": 2318 }, { "epoch": 0.6612489307100086, "grad_norm": 0.7855959677784145, "learning_rate": 2.7203820662500625e-06, "loss": 0.0356, "step": 2319 }, { "epoch": 0.6615340747077274, "grad_norm": 0.24261892957888342, "learning_rate": 2.716272370611375e-06, "loss": 0.004, "step": 2320 }, { "epoch": 0.6618192187054462, "grad_norm": 1.5723455114774483, "learning_rate": 2.712164623608844e-06, "loss": 0.0285, "step": 2321 }, { "epoch": 0.6621043627031651, "grad_norm": 0.24075125669855854, "learning_rate": 2.7080588287474885e-06, "loss": 0.0038, "step": 2322 }, { "epoch": 0.6623895067008839, "grad_norm": 0.6828692169124314, "learning_rate": 2.7039549895306593e-06, "loss": 0.0154, "step": 2323 }, { "epoch": 0.6626746506986028, "grad_norm": 0.6957630191781368, "learning_rate": 2.699853109460039e-06, "loss": 0.0097, "step": 2324 }, { "epoch": 0.6629597946963216, "grad_norm": 2.9059174955559723, "learning_rate": 2.695753192035639e-06, "loss": 0.0604, "step": 2325 }, { "epoch": 0.6632449386940404, "grad_norm": 1.1734601436082144, "learning_rate": 2.691655240755795e-06, "loss": 0.0181, "step": 2326 }, { "epoch": 0.6635300826917594, "grad_norm": 1.3146469546781792, "learning_rate": 2.6875592591171663e-06, "loss": 0.0249, "step": 2327 }, { "epoch": 0.6638152266894782, "grad_norm": 0.6224899882604887, "learning_rate": 2.6834652506147297e-06, "loss": 0.0094, "step": 2328 }, { "epoch": 0.6641003706871971, "grad_norm": 0.8606657609714811, "learning_rate": 2.67937321874178e-06, "loss": 0.0117, "step": 2329 }, { "epoch": 0.6643855146849159, "grad_norm": 1.237561338520917, "learning_rate": 2.675283166989926e-06, "loss": 0.0251, "step": 2330 }, { "epoch": 0.6646706586826348, "grad_norm": 1.3971850707966957, "learning_rate": 2.671195098849089e-06, "loss": 0.0557, "step": 2331 }, { "epoch": 0.6649558026803536, "grad_norm": 0.8052143234506958, "learning_rate": 2.6671090178074878e-06, "loss": 0.0147, "step": 2332 }, { "epoch": 0.6652409466780724, "grad_norm": 1.6731504755642395, "learning_rate": 2.663024927351655e-06, "loss": 0.0187, "step": 2333 }, { "epoch": 0.6655260906757913, "grad_norm": 3.2684569195727837, "learning_rate": 2.658942830966425e-06, "loss": 0.0474, "step": 2334 }, { "epoch": 0.6658112346735101, "grad_norm": 1.6686161366860728, "learning_rate": 2.654862732134926e-06, "loss": 0.0957, "step": 2335 }, { "epoch": 0.666096378671229, "grad_norm": 1.8905217359113058, "learning_rate": 2.6507846343385862e-06, "loss": 0.0332, "step": 2336 }, { "epoch": 0.6663815226689478, "grad_norm": 0.8396385938218617, "learning_rate": 2.6467085410571175e-06, "loss": 0.0125, "step": 2337 }, { "epoch": 0.6666666666666666, "grad_norm": 0.6943892112720279, "learning_rate": 2.6426344557685342e-06, "loss": 0.0151, "step": 2338 }, { "epoch": 0.6669518106643855, "grad_norm": 0.8584969710658708, "learning_rate": 2.6385623819491278e-06, "loss": 0.0174, "step": 2339 }, { "epoch": 0.6672369546621043, "grad_norm": 1.1323015265299097, "learning_rate": 2.6344923230734786e-06, "loss": 0.0344, "step": 2340 }, { "epoch": 0.6675220986598233, "grad_norm": 1.2498573670424775, "learning_rate": 2.630424282614441e-06, "loss": 0.0258, "step": 2341 }, { "epoch": 0.6678072426575421, "grad_norm": 1.6270286307278543, "learning_rate": 2.6263582640431595e-06, "loss": 0.0707, "step": 2342 }, { "epoch": 0.6680923866552609, "grad_norm": 0.9402570992440751, "learning_rate": 2.622294270829039e-06, "loss": 0.0289, "step": 2343 }, { "epoch": 0.6683775306529798, "grad_norm": 1.1740285977390892, "learning_rate": 2.6182323064397645e-06, "loss": 0.0236, "step": 2344 }, { "epoch": 0.6686626746506986, "grad_norm": 2.7212202569279995, "learning_rate": 2.614172374341287e-06, "loss": 0.0568, "step": 2345 }, { "epoch": 0.6689478186484175, "grad_norm": 0.6205323905446688, "learning_rate": 2.610114477997827e-06, "loss": 0.0109, "step": 2346 }, { "epoch": 0.6692329626461363, "grad_norm": 2.0357402784016387, "learning_rate": 2.6060586208718673e-06, "loss": 0.0345, "step": 2347 }, { "epoch": 0.6695181066438551, "grad_norm": 1.3978184198875205, "learning_rate": 2.602004806424144e-06, "loss": 0.0201, "step": 2348 }, { "epoch": 0.669803250641574, "grad_norm": 0.8378057178472931, "learning_rate": 2.597953038113655e-06, "loss": 0.013, "step": 2349 }, { "epoch": 0.6700883946392928, "grad_norm": 1.8429505668991712, "learning_rate": 2.5939033193976567e-06, "loss": 0.034, "step": 2350 }, { "epoch": 0.6703735386370117, "grad_norm": 0.6951733957823362, "learning_rate": 2.58985565373165e-06, "loss": 0.0149, "step": 2351 }, { "epoch": 0.6706586826347305, "grad_norm": 1.254638572408408, "learning_rate": 2.585810044569387e-06, "loss": 0.0198, "step": 2352 }, { "epoch": 0.6709438266324493, "grad_norm": 1.2282829118276561, "learning_rate": 2.581766495362864e-06, "loss": 0.0153, "step": 2353 }, { "epoch": 0.6712289706301683, "grad_norm": 1.8682120897475893, "learning_rate": 2.57772500956232e-06, "loss": 0.0286, "step": 2354 }, { "epoch": 0.6715141146278871, "grad_norm": 0.9128992862378721, "learning_rate": 2.573685590616234e-06, "loss": 0.0178, "step": 2355 }, { "epoch": 0.671799258625606, "grad_norm": 1.556527433042271, "learning_rate": 2.5696482419713197e-06, "loss": 0.0441, "step": 2356 }, { "epoch": 0.6720844026233248, "grad_norm": 0.2516116743394582, "learning_rate": 2.565612967072525e-06, "loss": 0.0058, "step": 2357 }, { "epoch": 0.6723695466210436, "grad_norm": 1.191372773291596, "learning_rate": 2.5615797693630328e-06, "loss": 0.0368, "step": 2358 }, { "epoch": 0.6726546906187625, "grad_norm": 1.091184520957333, "learning_rate": 2.557548652284245e-06, "loss": 0.0259, "step": 2359 }, { "epoch": 0.6729398346164813, "grad_norm": 1.0730756075730936, "learning_rate": 2.553519619275794e-06, "loss": 0.0185, "step": 2360 }, { "epoch": 0.6732249786142002, "grad_norm": 1.8119984458090785, "learning_rate": 2.5494926737755306e-06, "loss": 0.0379, "step": 2361 }, { "epoch": 0.673510122611919, "grad_norm": 0.5991375226390366, "learning_rate": 2.5454678192195305e-06, "loss": 0.0154, "step": 2362 }, { "epoch": 0.6737952666096378, "grad_norm": 0.8446181744229467, "learning_rate": 2.5414450590420793e-06, "loss": 0.016, "step": 2363 }, { "epoch": 0.6740804106073567, "grad_norm": 1.2506916116716331, "learning_rate": 2.5374243966756782e-06, "loss": 0.0179, "step": 2364 }, { "epoch": 0.6743655546050755, "grad_norm": 0.7844351247089717, "learning_rate": 2.5334058355510337e-06, "loss": 0.0142, "step": 2365 }, { "epoch": 0.6746506986027944, "grad_norm": 1.2753595775969064, "learning_rate": 2.529389379097067e-06, "loss": 0.0584, "step": 2366 }, { "epoch": 0.6749358426005133, "grad_norm": 2.130096709691072, "learning_rate": 2.5253750307408996e-06, "loss": 0.0433, "step": 2367 }, { "epoch": 0.6752209865982322, "grad_norm": 2.4889057844910223, "learning_rate": 2.5213627939078534e-06, "loss": 0.0529, "step": 2368 }, { "epoch": 0.675506130595951, "grad_norm": 2.0733191308785397, "learning_rate": 2.517352672021449e-06, "loss": 0.0735, "step": 2369 }, { "epoch": 0.6757912745936698, "grad_norm": 0.8721844668658577, "learning_rate": 2.5133446685034048e-06, "loss": 0.02, "step": 2370 }, { "epoch": 0.6760764185913887, "grad_norm": 0.5662408732865061, "learning_rate": 2.5093387867736275e-06, "loss": 0.0185, "step": 2371 }, { "epoch": 0.6763615625891075, "grad_norm": 0.7771568621772995, "learning_rate": 2.5053350302502167e-06, "loss": 0.016, "step": 2372 }, { "epoch": 0.6766467065868264, "grad_norm": 1.699224004876367, "learning_rate": 2.5013334023494556e-06, "loss": 0.0308, "step": 2373 }, { "epoch": 0.6769318505845452, "grad_norm": 0.6987391064530609, "learning_rate": 2.4973339064858163e-06, "loss": 0.0159, "step": 2374 }, { "epoch": 0.677216994582264, "grad_norm": 1.6646679512778157, "learning_rate": 2.49333654607195e-06, "loss": 0.0236, "step": 2375 }, { "epoch": 0.6775021385799829, "grad_norm": 1.6455502915346387, "learning_rate": 2.489341324518678e-06, "loss": 0.0621, "step": 2376 }, { "epoch": 0.6777872825777017, "grad_norm": 0.7215796317053166, "learning_rate": 2.4853482452350048e-06, "loss": 0.0194, "step": 2377 }, { "epoch": 0.6780724265754206, "grad_norm": 0.9265039963157378, "learning_rate": 2.4813573116281083e-06, "loss": 0.0287, "step": 2378 }, { "epoch": 0.6783575705731394, "grad_norm": 1.386647458592848, "learning_rate": 2.477368527103329e-06, "loss": 0.0276, "step": 2379 }, { "epoch": 0.6786427145708582, "grad_norm": 1.0194721860507407, "learning_rate": 2.473381895064181e-06, "loss": 0.0569, "step": 2380 }, { "epoch": 0.6789278585685772, "grad_norm": 1.5781901137615435, "learning_rate": 2.46939741891233e-06, "loss": 0.0219, "step": 2381 }, { "epoch": 0.679213002566296, "grad_norm": 0.9271661128511948, "learning_rate": 2.4654151020476165e-06, "loss": 0.0154, "step": 2382 }, { "epoch": 0.6794981465640149, "grad_norm": 1.2929840875659686, "learning_rate": 2.461434947868029e-06, "loss": 0.0336, "step": 2383 }, { "epoch": 0.6797832905617337, "grad_norm": 0.8356221756560887, "learning_rate": 2.4574569597697145e-06, "loss": 0.0202, "step": 2384 }, { "epoch": 0.6800684345594525, "grad_norm": 0.664795804036026, "learning_rate": 2.4534811411469704e-06, "loss": 0.0104, "step": 2385 }, { "epoch": 0.6803535785571714, "grad_norm": 1.9929916630870672, "learning_rate": 2.4495074953922425e-06, "loss": 0.0277, "step": 2386 }, { "epoch": 0.6806387225548902, "grad_norm": 1.5262298229248654, "learning_rate": 2.4455360258961247e-06, "loss": 0.0325, "step": 2387 }, { "epoch": 0.6809238665526091, "grad_norm": 0.7087902063036329, "learning_rate": 2.4415667360473518e-06, "loss": 0.0165, "step": 2388 }, { "epoch": 0.6812090105503279, "grad_norm": 0.7561419087572873, "learning_rate": 2.437599629232797e-06, "loss": 0.011, "step": 2389 }, { "epoch": 0.6814941545480467, "grad_norm": 1.045806215953938, "learning_rate": 2.433634708837478e-06, "loss": 0.0236, "step": 2390 }, { "epoch": 0.6817792985457656, "grad_norm": 1.3375850382967915, "learning_rate": 2.4296719782445422e-06, "loss": 0.0322, "step": 2391 }, { "epoch": 0.6820644425434844, "grad_norm": 1.442188021651849, "learning_rate": 2.4257114408352646e-06, "loss": 0.0382, "step": 2392 }, { "epoch": 0.6823495865412033, "grad_norm": 1.4811177425124444, "learning_rate": 2.421753099989052e-06, "loss": 0.0399, "step": 2393 }, { "epoch": 0.6826347305389222, "grad_norm": 1.2151409515623008, "learning_rate": 2.4177969590834425e-06, "loss": 0.0344, "step": 2394 }, { "epoch": 0.682919874536641, "grad_norm": 2.7056727418479958, "learning_rate": 2.4138430214940906e-06, "loss": 0.0551, "step": 2395 }, { "epoch": 0.6832050185343599, "grad_norm": 0.5969770040702388, "learning_rate": 2.4098912905947712e-06, "loss": 0.015, "step": 2396 }, { "epoch": 0.6834901625320787, "grad_norm": 1.4726176220224632, "learning_rate": 2.4059417697573782e-06, "loss": 0.0191, "step": 2397 }, { "epoch": 0.6837753065297976, "grad_norm": 0.6985703906149163, "learning_rate": 2.4019944623519194e-06, "loss": 0.0217, "step": 2398 }, { "epoch": 0.6840604505275164, "grad_norm": 1.3556092377104045, "learning_rate": 2.3980493717465124e-06, "loss": 0.0314, "step": 2399 }, { "epoch": 0.6843455945252352, "grad_norm": 1.6320428555492388, "learning_rate": 2.394106501307386e-06, "loss": 0.0406, "step": 2400 }, { "epoch": 0.6846307385229541, "grad_norm": 0.3152755510356312, "learning_rate": 2.390165854398872e-06, "loss": 0.0076, "step": 2401 }, { "epoch": 0.6849158825206729, "grad_norm": 0.5496493802305953, "learning_rate": 2.386227434383407e-06, "loss": 0.0095, "step": 2402 }, { "epoch": 0.6852010265183918, "grad_norm": 0.6520718654651182, "learning_rate": 2.3822912446215244e-06, "loss": 0.0179, "step": 2403 }, { "epoch": 0.6854861705161106, "grad_norm": 1.2473935565493588, "learning_rate": 2.3783572884718592e-06, "loss": 0.0319, "step": 2404 }, { "epoch": 0.6857713145138294, "grad_norm": 1.0535903429708244, "learning_rate": 2.3744255692911345e-06, "loss": 0.015, "step": 2405 }, { "epoch": 0.6860564585115483, "grad_norm": 1.153321160939757, "learning_rate": 2.370496090434172e-06, "loss": 0.0243, "step": 2406 }, { "epoch": 0.6863416025092672, "grad_norm": 0.8326855476595857, "learning_rate": 2.3665688552538767e-06, "loss": 0.016, "step": 2407 }, { "epoch": 0.6866267465069861, "grad_norm": 1.4628607016553867, "learning_rate": 2.3626438671012412e-06, "loss": 0.0411, "step": 2408 }, { "epoch": 0.6869118905047049, "grad_norm": 0.5985342385584901, "learning_rate": 2.358721129325336e-06, "loss": 0.0153, "step": 2409 }, { "epoch": 0.6871970345024238, "grad_norm": 0.8315001817874339, "learning_rate": 2.354800645273319e-06, "loss": 0.0314, "step": 2410 }, { "epoch": 0.6874821785001426, "grad_norm": 0.6885528098745334, "learning_rate": 2.3508824182904207e-06, "loss": 0.0124, "step": 2411 }, { "epoch": 0.6877673224978614, "grad_norm": 1.9141392122493273, "learning_rate": 2.3469664517199463e-06, "loss": 0.0354, "step": 2412 }, { "epoch": 0.6880524664955803, "grad_norm": 1.1665319631123647, "learning_rate": 2.3430527489032723e-06, "loss": 0.0215, "step": 2413 }, { "epoch": 0.6883376104932991, "grad_norm": 1.0810004667670476, "learning_rate": 2.339141313179844e-06, "loss": 0.0321, "step": 2414 }, { "epoch": 0.688622754491018, "grad_norm": 0.8222383468643456, "learning_rate": 2.3352321478871717e-06, "loss": 0.023, "step": 2415 }, { "epoch": 0.6889078984887368, "grad_norm": 1.1477329288258749, "learning_rate": 2.331325256360828e-06, "loss": 0.0147, "step": 2416 }, { "epoch": 0.6891930424864556, "grad_norm": 0.4781898959973014, "learning_rate": 2.327420641934447e-06, "loss": 0.0059, "step": 2417 }, { "epoch": 0.6894781864841745, "grad_norm": 1.1660744412330555, "learning_rate": 2.323518307939717e-06, "loss": 0.0294, "step": 2418 }, { "epoch": 0.6897633304818933, "grad_norm": 0.5872242883588114, "learning_rate": 2.3196182577063868e-06, "loss": 0.0121, "step": 2419 }, { "epoch": 0.6900484744796122, "grad_norm": 2.212762462397327, "learning_rate": 2.315720494562248e-06, "loss": 0.0324, "step": 2420 }, { "epoch": 0.6903336184773311, "grad_norm": 1.6373711381327187, "learning_rate": 2.3118250218331463e-06, "loss": 0.0185, "step": 2421 }, { "epoch": 0.6906187624750499, "grad_norm": 0.7770141753292534, "learning_rate": 2.307931842842968e-06, "loss": 0.0299, "step": 2422 }, { "epoch": 0.6909039064727688, "grad_norm": 1.518201141107165, "learning_rate": 2.3040409609136515e-06, "loss": 0.0463, "step": 2423 }, { "epoch": 0.6911890504704876, "grad_norm": 0.43241598666358966, "learning_rate": 2.3001523793651688e-06, "loss": 0.0124, "step": 2424 }, { "epoch": 0.6914741944682065, "grad_norm": 1.0771130938316928, "learning_rate": 2.2962661015155234e-06, "loss": 0.0561, "step": 2425 }, { "epoch": 0.6917593384659253, "grad_norm": 0.9154394384341286, "learning_rate": 2.292382130680766e-06, "loss": 0.0182, "step": 2426 }, { "epoch": 0.6920444824636441, "grad_norm": 0.9492377155776165, "learning_rate": 2.2885004701749695e-06, "loss": 0.0181, "step": 2427 }, { "epoch": 0.692329626461363, "grad_norm": 1.1013460236131432, "learning_rate": 2.2846211233102387e-06, "loss": 0.03, "step": 2428 }, { "epoch": 0.6926147704590818, "grad_norm": 1.1127112319248678, "learning_rate": 2.2807440933967034e-06, "loss": 0.021, "step": 2429 }, { "epoch": 0.6928999144568007, "grad_norm": 0.9531578556692747, "learning_rate": 2.276869383742517e-06, "loss": 0.0149, "step": 2430 }, { "epoch": 0.6931850584545195, "grad_norm": 1.0679032804811803, "learning_rate": 2.2729969976538524e-06, "loss": 0.0187, "step": 2431 }, { "epoch": 0.6934702024522383, "grad_norm": 2.528890187837627, "learning_rate": 2.2691269384349007e-06, "loss": 0.0397, "step": 2432 }, { "epoch": 0.6937553464499572, "grad_norm": 1.366923157548722, "learning_rate": 2.265259209387867e-06, "loss": 0.0273, "step": 2433 }, { "epoch": 0.6940404904476761, "grad_norm": 0.6850783877005101, "learning_rate": 2.261393813812966e-06, "loss": 0.015, "step": 2434 }, { "epoch": 0.694325634445395, "grad_norm": 1.0072621570466294, "learning_rate": 2.2575307550084295e-06, "loss": 0.0234, "step": 2435 }, { "epoch": 0.6946107784431138, "grad_norm": 1.3465168819525803, "learning_rate": 2.2536700362704846e-06, "loss": 0.0213, "step": 2436 }, { "epoch": 0.6948959224408326, "grad_norm": 1.0391955609835548, "learning_rate": 2.2498116608933673e-06, "loss": 0.0157, "step": 2437 }, { "epoch": 0.6951810664385515, "grad_norm": 2.814771203840875, "learning_rate": 2.2459556321693123e-06, "loss": 0.0552, "step": 2438 }, { "epoch": 0.6954662104362703, "grad_norm": 0.6121746444624311, "learning_rate": 2.242101953388556e-06, "loss": 0.0112, "step": 2439 }, { "epoch": 0.6957513544339892, "grad_norm": 1.3291384876570984, "learning_rate": 2.238250627839325e-06, "loss": 0.0131, "step": 2440 }, { "epoch": 0.696036498431708, "grad_norm": 1.1148763168142317, "learning_rate": 2.2344016588078403e-06, "loss": 0.0354, "step": 2441 }, { "epoch": 0.6963216424294268, "grad_norm": 1.36958013088009, "learning_rate": 2.230555049578312e-06, "loss": 0.029, "step": 2442 }, { "epoch": 0.6966067864271457, "grad_norm": 0.7790771298504943, "learning_rate": 2.2267108034329343e-06, "loss": 0.0127, "step": 2443 }, { "epoch": 0.6968919304248645, "grad_norm": 1.1333344045819018, "learning_rate": 2.222868923651888e-06, "loss": 0.0225, "step": 2444 }, { "epoch": 0.6971770744225834, "grad_norm": 1.2892580769517954, "learning_rate": 2.2190294135133334e-06, "loss": 0.016, "step": 2445 }, { "epoch": 0.6974622184203022, "grad_norm": 2.2596154885188304, "learning_rate": 2.2151922762934096e-06, "loss": 0.0359, "step": 2446 }, { "epoch": 0.6977473624180212, "grad_norm": 1.5204543539753177, "learning_rate": 2.2113575152662304e-06, "loss": 0.0225, "step": 2447 }, { "epoch": 0.69803250641574, "grad_norm": 2.129816314096946, "learning_rate": 2.207525133703881e-06, "loss": 0.0312, "step": 2448 }, { "epoch": 0.6983176504134588, "grad_norm": 0.8726329016607753, "learning_rate": 2.203695134876419e-06, "loss": 0.0129, "step": 2449 }, { "epoch": 0.6986027944111777, "grad_norm": 0.981509542326578, "learning_rate": 2.199867522051865e-06, "loss": 0.0306, "step": 2450 }, { "epoch": 0.6988879384088965, "grad_norm": 1.2233864788734838, "learning_rate": 2.1960422984962094e-06, "loss": 0.0457, "step": 2451 }, { "epoch": 0.6991730824066154, "grad_norm": 1.121663408796296, "learning_rate": 2.1922194674734003e-06, "loss": 0.0278, "step": 2452 }, { "epoch": 0.6994582264043342, "grad_norm": 1.612742447635186, "learning_rate": 2.1883990322453414e-06, "loss": 0.0537, "step": 2453 }, { "epoch": 0.699743370402053, "grad_norm": 0.9032206069159022, "learning_rate": 2.184580996071895e-06, "loss": 0.0382, "step": 2454 }, { "epoch": 0.7000285143997719, "grad_norm": 1.3020251741485003, "learning_rate": 2.1807653622108797e-06, "loss": 0.0511, "step": 2455 }, { "epoch": 0.7003136583974907, "grad_norm": 0.927814184904209, "learning_rate": 2.1769521339180604e-06, "loss": 0.0291, "step": 2456 }, { "epoch": 0.7005988023952096, "grad_norm": 1.4080121788852615, "learning_rate": 2.17314131444715e-06, "loss": 0.0168, "step": 2457 }, { "epoch": 0.7008839463929284, "grad_norm": 1.9115355677255668, "learning_rate": 2.1693329070498057e-06, "loss": 0.0243, "step": 2458 }, { "epoch": 0.7011690903906472, "grad_norm": 0.6119517264756272, "learning_rate": 2.165526914975628e-06, "loss": 0.0128, "step": 2459 }, { "epoch": 0.7014542343883661, "grad_norm": 1.2212817821331179, "learning_rate": 2.1617233414721546e-06, "loss": 0.0124, "step": 2460 }, { "epoch": 0.701739378386085, "grad_norm": 1.0456467000073637, "learning_rate": 2.1579221897848608e-06, "loss": 0.0225, "step": 2461 }, { "epoch": 0.7020245223838039, "grad_norm": 1.9001029978437518, "learning_rate": 2.1541234631571533e-06, "loss": 0.0347, "step": 2462 }, { "epoch": 0.7023096663815227, "grad_norm": 1.1170803439317123, "learning_rate": 2.1503271648303776e-06, "loss": 0.0158, "step": 2463 }, { "epoch": 0.7025948103792415, "grad_norm": 1.2513944489653663, "learning_rate": 2.1465332980437937e-06, "loss": 0.0185, "step": 2464 }, { "epoch": 0.7028799543769604, "grad_norm": 0.9513327072225457, "learning_rate": 2.1427418660345978e-06, "loss": 0.0162, "step": 2465 }, { "epoch": 0.7031650983746792, "grad_norm": 1.0188208991697878, "learning_rate": 2.138952872037902e-06, "loss": 0.044, "step": 2466 }, { "epoch": 0.7034502423723981, "grad_norm": 1.5772768139954325, "learning_rate": 2.135166319286745e-06, "loss": 0.05, "step": 2467 }, { "epoch": 0.7037353863701169, "grad_norm": 1.8088612980621237, "learning_rate": 2.1313822110120787e-06, "loss": 0.0434, "step": 2468 }, { "epoch": 0.7040205303678357, "grad_norm": 0.589663362629804, "learning_rate": 2.1276005504427643e-06, "loss": 0.0089, "step": 2469 }, { "epoch": 0.7043056743655546, "grad_norm": 1.0455626616060543, "learning_rate": 2.1238213408055806e-06, "loss": 0.0252, "step": 2470 }, { "epoch": 0.7045908183632734, "grad_norm": 1.2419091857004885, "learning_rate": 2.1200445853252165e-06, "loss": 0.0313, "step": 2471 }, { "epoch": 0.7048759623609923, "grad_norm": 1.309158885132024, "learning_rate": 2.116270287224262e-06, "loss": 0.016, "step": 2472 }, { "epoch": 0.7051611063587111, "grad_norm": 2.46200008191099, "learning_rate": 2.1124984497232127e-06, "loss": 0.0427, "step": 2473 }, { "epoch": 0.70544625035643, "grad_norm": 1.1686060450283622, "learning_rate": 2.1087290760404634e-06, "loss": 0.0226, "step": 2474 }, { "epoch": 0.7057313943541489, "grad_norm": 1.4653606459109394, "learning_rate": 2.1049621693923084e-06, "loss": 0.0413, "step": 2475 }, { "epoch": 0.7060165383518677, "grad_norm": 0.7719168267016131, "learning_rate": 2.101197732992935e-06, "loss": 0.0205, "step": 2476 }, { "epoch": 0.7063016823495866, "grad_norm": 0.4502961937417831, "learning_rate": 2.0974357700544244e-06, "loss": 0.0117, "step": 2477 }, { "epoch": 0.7065868263473054, "grad_norm": 1.385954489989293, "learning_rate": 2.0936762837867445e-06, "loss": 0.0242, "step": 2478 }, { "epoch": 0.7068719703450242, "grad_norm": 2.1799652784336216, "learning_rate": 2.0899192773977574e-06, "loss": 0.0569, "step": 2479 }, { "epoch": 0.7071571143427431, "grad_norm": 1.1110636882709009, "learning_rate": 2.086164754093198e-06, "loss": 0.0316, "step": 2480 }, { "epoch": 0.7074422583404619, "grad_norm": 1.4941307442971459, "learning_rate": 2.0824127170766904e-06, "loss": 0.0239, "step": 2481 }, { "epoch": 0.7077274023381808, "grad_norm": 0.6801030858403582, "learning_rate": 2.0786631695497335e-06, "loss": 0.0135, "step": 2482 }, { "epoch": 0.7080125463358996, "grad_norm": 1.1364604361529025, "learning_rate": 2.074916114711706e-06, "loss": 0.0166, "step": 2483 }, { "epoch": 0.7082976903336184, "grad_norm": 1.2388624592631503, "learning_rate": 2.071171555759856e-06, "loss": 0.02, "step": 2484 }, { "epoch": 0.7085828343313373, "grad_norm": 0.9231133048338788, "learning_rate": 2.0674294958893052e-06, "loss": 0.0132, "step": 2485 }, { "epoch": 0.7088679783290561, "grad_norm": 1.0122357410820453, "learning_rate": 2.0636899382930357e-06, "loss": 0.0195, "step": 2486 }, { "epoch": 0.7091531223267751, "grad_norm": 1.0915144391953777, "learning_rate": 2.0599528861619046e-06, "loss": 0.0218, "step": 2487 }, { "epoch": 0.7094382663244939, "grad_norm": 0.5781606335754661, "learning_rate": 2.056218342684624e-06, "loss": 0.0138, "step": 2488 }, { "epoch": 0.7097234103222128, "grad_norm": 1.2034483165455416, "learning_rate": 2.0524863110477683e-06, "loss": 0.0208, "step": 2489 }, { "epoch": 0.7100085543199316, "grad_norm": 1.0223652929411267, "learning_rate": 2.0487567944357658e-06, "loss": 0.0177, "step": 2490 }, { "epoch": 0.7102936983176504, "grad_norm": 0.7589363858158285, "learning_rate": 2.0450297960309057e-06, "loss": 0.0191, "step": 2491 }, { "epoch": 0.7105788423153693, "grad_norm": 1.7624288774750982, "learning_rate": 2.0413053190133198e-06, "loss": 0.0462, "step": 2492 }, { "epoch": 0.7108639863130881, "grad_norm": 0.9123674547573875, "learning_rate": 2.0375833665609927e-06, "loss": 0.0349, "step": 2493 }, { "epoch": 0.711149130310807, "grad_norm": 1.1539174639109313, "learning_rate": 2.033863941849754e-06, "loss": 0.0383, "step": 2494 }, { "epoch": 0.7114342743085258, "grad_norm": 0.9661273666986842, "learning_rate": 2.0301470480532803e-06, "loss": 0.0296, "step": 2495 }, { "epoch": 0.7117194183062446, "grad_norm": 1.101711554212656, "learning_rate": 2.026432688343085e-06, "loss": 0.0322, "step": 2496 }, { "epoch": 0.7120045623039635, "grad_norm": 1.700057021557032, "learning_rate": 2.0227208658885167e-06, "loss": 0.0282, "step": 2497 }, { "epoch": 0.7122897063016823, "grad_norm": 0.6125316137983147, "learning_rate": 2.019011583856761e-06, "loss": 0.009, "step": 2498 }, { "epoch": 0.7125748502994012, "grad_norm": 2.1882827226129273, "learning_rate": 2.015304845412841e-06, "loss": 0.0422, "step": 2499 }, { "epoch": 0.71285999429712, "grad_norm": 0.9211104223469543, "learning_rate": 2.0116006537196033e-06, "loss": 0.0327, "step": 2500 }, { "epoch": 0.7131451382948389, "grad_norm": 1.0694540209104813, "learning_rate": 2.0078990119377233e-06, "loss": 0.0279, "step": 2501 }, { "epoch": 0.7134302822925578, "grad_norm": 1.388175054161624, "learning_rate": 2.004199923225701e-06, "loss": 0.0259, "step": 2502 }, { "epoch": 0.7137154262902766, "grad_norm": 1.0988981162449094, "learning_rate": 2.0005033907398574e-06, "loss": 0.0212, "step": 2503 }, { "epoch": 0.7140005702879955, "grad_norm": 1.1482399987583405, "learning_rate": 1.9968094176343322e-06, "loss": 0.0281, "step": 2504 }, { "epoch": 0.7142857142857143, "grad_norm": 0.5501580823692963, "learning_rate": 1.9931180070610823e-06, "loss": 0.012, "step": 2505 }, { "epoch": 0.7145708582834331, "grad_norm": 1.2633694375174702, "learning_rate": 1.989429162169876e-06, "loss": 0.0304, "step": 2506 }, { "epoch": 0.714856002281152, "grad_norm": 0.6924873154457065, "learning_rate": 1.9857428861082976e-06, "loss": 0.0141, "step": 2507 }, { "epoch": 0.7151411462788708, "grad_norm": 0.6065820485902311, "learning_rate": 1.9820591820217315e-06, "loss": 0.0207, "step": 2508 }, { "epoch": 0.7154262902765897, "grad_norm": 0.7765390907982586, "learning_rate": 1.978378053053373e-06, "loss": 0.0139, "step": 2509 }, { "epoch": 0.7157114342743085, "grad_norm": 2.0988006081981316, "learning_rate": 1.9746995023442177e-06, "loss": 0.0231, "step": 2510 }, { "epoch": 0.7159965782720273, "grad_norm": 1.0519010093126555, "learning_rate": 1.9710235330330656e-06, "loss": 0.0181, "step": 2511 }, { "epoch": 0.7162817222697462, "grad_norm": 1.1614853715616795, "learning_rate": 1.9673501482565083e-06, "loss": 0.0341, "step": 2512 }, { "epoch": 0.716566866267465, "grad_norm": 0.8821473934969634, "learning_rate": 1.9636793511489377e-06, "loss": 0.0229, "step": 2513 }, { "epoch": 0.716852010265184, "grad_norm": 0.5250158672929854, "learning_rate": 1.9600111448425285e-06, "loss": 0.0159, "step": 2514 }, { "epoch": 0.7171371542629028, "grad_norm": 1.9676827735955886, "learning_rate": 1.9563455324672566e-06, "loss": 0.033, "step": 2515 }, { "epoch": 0.7174222982606216, "grad_norm": 0.648474303996466, "learning_rate": 1.952682517150877e-06, "loss": 0.0115, "step": 2516 }, { "epoch": 0.7177074422583405, "grad_norm": 1.3772601549572416, "learning_rate": 1.9490221020189306e-06, "loss": 0.0301, "step": 2517 }, { "epoch": 0.7179925862560593, "grad_norm": 1.526408416109426, "learning_rate": 1.945364290194739e-06, "loss": 0.0301, "step": 2518 }, { "epoch": 0.7182777302537782, "grad_norm": 1.00941806936837, "learning_rate": 1.941709084799404e-06, "loss": 0.0403, "step": 2519 }, { "epoch": 0.718562874251497, "grad_norm": 0.7841790764743731, "learning_rate": 1.9380564889518027e-06, "loss": 0.0206, "step": 2520 }, { "epoch": 0.7188480182492158, "grad_norm": 0.3196980040678001, "learning_rate": 1.9344065057685844e-06, "loss": 0.0117, "step": 2521 }, { "epoch": 0.7191331622469347, "grad_norm": 1.2494691029010219, "learning_rate": 1.9307591383641704e-06, "loss": 0.0271, "step": 2522 }, { "epoch": 0.7194183062446535, "grad_norm": 3.35964679264394, "learning_rate": 1.927114389850749e-06, "loss": 0.0591, "step": 2523 }, { "epoch": 0.7197034502423724, "grad_norm": 1.6156104346637272, "learning_rate": 1.923472263338278e-06, "loss": 0.0249, "step": 2524 }, { "epoch": 0.7199885942400912, "grad_norm": 1.1690876897119913, "learning_rate": 1.919832761934471e-06, "loss": 0.0336, "step": 2525 }, { "epoch": 0.72027373823781, "grad_norm": 0.4901043573896946, "learning_rate": 1.9161958887448036e-06, "loss": 0.0087, "step": 2526 }, { "epoch": 0.720558882235529, "grad_norm": 0.5766787110046747, "learning_rate": 1.912561646872515e-06, "loss": 0.0194, "step": 2527 }, { "epoch": 0.7208440262332478, "grad_norm": 0.4067719479340745, "learning_rate": 1.908930039418593e-06, "loss": 0.0074, "step": 2528 }, { "epoch": 0.7211291702309667, "grad_norm": 0.8190616283681241, "learning_rate": 1.9053010694817792e-06, "loss": 0.0151, "step": 2529 }, { "epoch": 0.7214143142286855, "grad_norm": 1.4745099441171658, "learning_rate": 1.9016747401585612e-06, "loss": 0.0192, "step": 2530 }, { "epoch": 0.7216994582264044, "grad_norm": 1.1626077125676921, "learning_rate": 1.8980510545431813e-06, "loss": 0.0336, "step": 2531 }, { "epoch": 0.7219846022241232, "grad_norm": 2.3232659525233785, "learning_rate": 1.89443001572762e-06, "loss": 0.0488, "step": 2532 }, { "epoch": 0.722269746221842, "grad_norm": 0.4293645535795284, "learning_rate": 1.8908116268016009e-06, "loss": 0.0186, "step": 2533 }, { "epoch": 0.7225548902195609, "grad_norm": 0.7057630192922806, "learning_rate": 1.8871958908525861e-06, "loss": 0.0165, "step": 2534 }, { "epoch": 0.7228400342172797, "grad_norm": 0.7801979051938523, "learning_rate": 1.883582810965775e-06, "loss": 0.0173, "step": 2535 }, { "epoch": 0.7231251782149986, "grad_norm": 0.6054884481515708, "learning_rate": 1.8799723902240995e-06, "loss": 0.012, "step": 2536 }, { "epoch": 0.7234103222127174, "grad_norm": 0.7295030558864235, "learning_rate": 1.8763646317082234e-06, "loss": 0.0143, "step": 2537 }, { "epoch": 0.7236954662104362, "grad_norm": 0.2348493963859001, "learning_rate": 1.872759538496539e-06, "loss": 0.0063, "step": 2538 }, { "epoch": 0.7239806102081551, "grad_norm": 1.5317613085812878, "learning_rate": 1.869157113665162e-06, "loss": 0.0177, "step": 2539 }, { "epoch": 0.7242657542058739, "grad_norm": 1.1185611844047032, "learning_rate": 1.8655573602879384e-06, "loss": 0.0406, "step": 2540 }, { "epoch": 0.7245508982035929, "grad_norm": 0.7553842956065492, "learning_rate": 1.8619602814364241e-06, "loss": 0.0142, "step": 2541 }, { "epoch": 0.7248360422013117, "grad_norm": 1.613711705936186, "learning_rate": 1.8583658801798988e-06, "loss": 0.021, "step": 2542 }, { "epoch": 0.7251211861990305, "grad_norm": 0.6639471861383226, "learning_rate": 1.8547741595853603e-06, "loss": 0.0152, "step": 2543 }, { "epoch": 0.7254063301967494, "grad_norm": 1.043444007239291, "learning_rate": 1.8511851227175142e-06, "loss": 0.0163, "step": 2544 }, { "epoch": 0.7256914741944682, "grad_norm": 1.3859476624660125, "learning_rate": 1.8475987726387783e-06, "loss": 0.028, "step": 2545 }, { "epoch": 0.7259766181921871, "grad_norm": 1.2707053562031854, "learning_rate": 1.8440151124092764e-06, "loss": 0.0203, "step": 2546 }, { "epoch": 0.7262617621899059, "grad_norm": 0.7743398165083601, "learning_rate": 1.8404341450868385e-06, "loss": 0.0225, "step": 2547 }, { "epoch": 0.7265469061876247, "grad_norm": 1.4948566545401907, "learning_rate": 1.836855873726997e-06, "loss": 0.0289, "step": 2548 }, { "epoch": 0.7268320501853436, "grad_norm": 1.4926542884907947, "learning_rate": 1.8332803013829824e-06, "loss": 0.0314, "step": 2549 }, { "epoch": 0.7271171941830624, "grad_norm": 1.7158591542352042, "learning_rate": 1.8297074311057233e-06, "loss": 0.0264, "step": 2550 }, { "epoch": 0.7274023381807813, "grad_norm": 0.7051197790810769, "learning_rate": 1.826137265943843e-06, "loss": 0.0123, "step": 2551 }, { "epoch": 0.7276874821785001, "grad_norm": 1.2332019677453634, "learning_rate": 1.822569808943656e-06, "loss": 0.0255, "step": 2552 }, { "epoch": 0.7279726261762189, "grad_norm": 1.4227488885844022, "learning_rate": 1.8190050631491662e-06, "loss": 0.0186, "step": 2553 }, { "epoch": 0.7282577701739379, "grad_norm": 0.888470253708542, "learning_rate": 1.8154430316020638e-06, "loss": 0.0091, "step": 2554 }, { "epoch": 0.7285429141716567, "grad_norm": 0.6362280906290803, "learning_rate": 1.811883717341722e-06, "loss": 0.0227, "step": 2555 }, { "epoch": 0.7288280581693756, "grad_norm": 2.0627256070084683, "learning_rate": 1.8083271234051991e-06, "loss": 0.0383, "step": 2556 }, { "epoch": 0.7291132021670944, "grad_norm": 1.1067413056479523, "learning_rate": 1.804773252827231e-06, "loss": 0.0333, "step": 2557 }, { "epoch": 0.7293983461648132, "grad_norm": 2.0036690116211333, "learning_rate": 1.8012221086402226e-06, "loss": 0.0302, "step": 2558 }, { "epoch": 0.7296834901625321, "grad_norm": 1.7330904351751883, "learning_rate": 1.7976736938742646e-06, "loss": 0.0202, "step": 2559 }, { "epoch": 0.7299686341602509, "grad_norm": 1.9066727681418387, "learning_rate": 1.7941280115571103e-06, "loss": 0.0521, "step": 2560 }, { "epoch": 0.7302537781579698, "grad_norm": 0.8350093333223596, "learning_rate": 1.7905850647141842e-06, "loss": 0.0174, "step": 2561 }, { "epoch": 0.7305389221556886, "grad_norm": 0.8935901878058463, "learning_rate": 1.787044856368576e-06, "loss": 0.0124, "step": 2562 }, { "epoch": 0.7308240661534074, "grad_norm": 0.2609415835600623, "learning_rate": 1.7835073895410393e-06, "loss": 0.0034, "step": 2563 }, { "epoch": 0.7311092101511263, "grad_norm": 0.7383110088545776, "learning_rate": 1.779972667249989e-06, "loss": 0.0132, "step": 2564 }, { "epoch": 0.7313943541488451, "grad_norm": 0.82098631234755, "learning_rate": 1.7764406925114957e-06, "loss": 0.0181, "step": 2565 }, { "epoch": 0.731679498146564, "grad_norm": 1.2122779163003397, "learning_rate": 1.7729114683392889e-06, "loss": 0.0186, "step": 2566 }, { "epoch": 0.7319646421442829, "grad_norm": 1.0409332876987545, "learning_rate": 1.769384997744747e-06, "loss": 0.0218, "step": 2567 }, { "epoch": 0.7322497861420018, "grad_norm": 1.0657193928152229, "learning_rate": 1.7658612837369065e-06, "loss": 0.0255, "step": 2568 }, { "epoch": 0.7325349301397206, "grad_norm": 2.009539824029055, "learning_rate": 1.7623403293224423e-06, "loss": 0.0335, "step": 2569 }, { "epoch": 0.7328200741374394, "grad_norm": 0.47135702796339735, "learning_rate": 1.7588221375056797e-06, "loss": 0.0124, "step": 2570 }, { "epoch": 0.7331052181351583, "grad_norm": 0.5183616314191448, "learning_rate": 1.7553067112885846e-06, "loss": 0.0047, "step": 2571 }, { "epoch": 0.7333903621328771, "grad_norm": 0.9381090221129461, "learning_rate": 1.751794053670769e-06, "loss": 0.0172, "step": 2572 }, { "epoch": 0.733675506130596, "grad_norm": 1.5789937408733317, "learning_rate": 1.7482841676494766e-06, "loss": 0.0205, "step": 2573 }, { "epoch": 0.7339606501283148, "grad_norm": 0.5928552002577685, "learning_rate": 1.7447770562195831e-06, "loss": 0.0137, "step": 2574 }, { "epoch": 0.7342457941260336, "grad_norm": 1.1734406658202, "learning_rate": 1.741272722373607e-06, "loss": 0.0276, "step": 2575 }, { "epoch": 0.7345309381237525, "grad_norm": 1.3469595275728594, "learning_rate": 1.7377711691016885e-06, "loss": 0.0291, "step": 2576 }, { "epoch": 0.7348160821214713, "grad_norm": 1.0553587040388435, "learning_rate": 1.7342723993915984e-06, "loss": 0.0337, "step": 2577 }, { "epoch": 0.7351012261191902, "grad_norm": 0.6104053843204919, "learning_rate": 1.730776416228731e-06, "loss": 0.0106, "step": 2578 }, { "epoch": 0.735386370116909, "grad_norm": 1.180139523029748, "learning_rate": 1.727283222596105e-06, "loss": 0.0286, "step": 2579 }, { "epoch": 0.7356715141146278, "grad_norm": 0.8943007056784072, "learning_rate": 1.723792821474356e-06, "loss": 0.0327, "step": 2580 }, { "epoch": 0.7359566581123468, "grad_norm": 1.407472268943547, "learning_rate": 1.7203052158417395e-06, "loss": 0.039, "step": 2581 }, { "epoch": 0.7362418021100656, "grad_norm": 1.2414400485072907, "learning_rate": 1.7168204086741242e-06, "loss": 0.0365, "step": 2582 }, { "epoch": 0.7365269461077845, "grad_norm": 1.045965446497316, "learning_rate": 1.7133384029449895e-06, "loss": 0.0134, "step": 2583 }, { "epoch": 0.7368120901055033, "grad_norm": 1.3401194939322665, "learning_rate": 1.7098592016254318e-06, "loss": 0.0192, "step": 2584 }, { "epoch": 0.7370972341032221, "grad_norm": 0.4330957022031905, "learning_rate": 1.7063828076841433e-06, "loss": 0.0042, "step": 2585 }, { "epoch": 0.737382378100941, "grad_norm": 0.6130445246080449, "learning_rate": 1.7029092240874284e-06, "loss": 0.0089, "step": 2586 }, { "epoch": 0.7376675220986598, "grad_norm": 0.8820092232571117, "learning_rate": 1.6994384537991898e-06, "loss": 0.011, "step": 2587 }, { "epoch": 0.7379526660963787, "grad_norm": 1.0509254864322741, "learning_rate": 1.6959704997809355e-06, "loss": 0.0291, "step": 2588 }, { "epoch": 0.7382378100940975, "grad_norm": 1.3113452555468992, "learning_rate": 1.6925053649917645e-06, "loss": 0.0255, "step": 2589 }, { "epoch": 0.7385229540918163, "grad_norm": 1.1114160968697633, "learning_rate": 1.6890430523883715e-06, "loss": 0.0176, "step": 2590 }, { "epoch": 0.7388080980895352, "grad_norm": 1.4047944529889076, "learning_rate": 1.6855835649250446e-06, "loss": 0.0161, "step": 2591 }, { "epoch": 0.739093242087254, "grad_norm": 1.268771098085473, "learning_rate": 1.6821269055536604e-06, "loss": 0.0359, "step": 2592 }, { "epoch": 0.7393783860849729, "grad_norm": 1.0412207129886348, "learning_rate": 1.678673077223682e-06, "loss": 0.0111, "step": 2593 }, { "epoch": 0.7396635300826918, "grad_norm": 1.9488662026753016, "learning_rate": 1.6752220828821574e-06, "loss": 0.0355, "step": 2594 }, { "epoch": 0.7399486740804107, "grad_norm": 0.6388958326044936, "learning_rate": 1.671773925473717e-06, "loss": 0.0161, "step": 2595 }, { "epoch": 0.7402338180781295, "grad_norm": 1.3676324169071015, "learning_rate": 1.6683286079405692e-06, "loss": 0.0446, "step": 2596 }, { "epoch": 0.7405189620758483, "grad_norm": 0.7729955227718246, "learning_rate": 1.6648861332225002e-06, "loss": 0.0168, "step": 2597 }, { "epoch": 0.7408041060735672, "grad_norm": 0.6524911607576815, "learning_rate": 1.66144650425687e-06, "loss": 0.0067, "step": 2598 }, { "epoch": 0.741089250071286, "grad_norm": 0.9737064422050019, "learning_rate": 1.6580097239786096e-06, "loss": 0.0215, "step": 2599 }, { "epoch": 0.7413743940690048, "grad_norm": 2.1748723955431237, "learning_rate": 1.654575795320223e-06, "loss": 0.0546, "step": 2600 }, { "epoch": 0.7416595380667237, "grad_norm": 1.4285912646142453, "learning_rate": 1.6511447212117786e-06, "loss": 0.0346, "step": 2601 }, { "epoch": 0.7419446820644425, "grad_norm": 2.7288639089504745, "learning_rate": 1.6477165045809052e-06, "loss": 0.0358, "step": 2602 }, { "epoch": 0.7422298260621614, "grad_norm": 2.3650702688759018, "learning_rate": 1.6442911483527978e-06, "loss": 0.0669, "step": 2603 }, { "epoch": 0.7425149700598802, "grad_norm": 1.2690767225620623, "learning_rate": 1.6408686554502124e-06, "loss": 0.0169, "step": 2604 }, { "epoch": 0.742800114057599, "grad_norm": 1.4869660462533452, "learning_rate": 1.637449028793458e-06, "loss": 0.0294, "step": 2605 }, { "epoch": 0.7430852580553179, "grad_norm": 1.3737598171297696, "learning_rate": 1.6340322713003992e-06, "loss": 0.026, "step": 2606 }, { "epoch": 0.7433704020530368, "grad_norm": 0.5276542234582096, "learning_rate": 1.6306183858864528e-06, "loss": 0.0108, "step": 2607 }, { "epoch": 0.7436555460507557, "grad_norm": 1.208035745863832, "learning_rate": 1.6272073754645845e-06, "loss": 0.0434, "step": 2608 }, { "epoch": 0.7439406900484745, "grad_norm": 1.2060462983723554, "learning_rate": 1.623799242945307e-06, "loss": 0.0599, "step": 2609 }, { "epoch": 0.7442258340461934, "grad_norm": 1.613815953946869, "learning_rate": 1.6203939912366768e-06, "loss": 0.0241, "step": 2610 }, { "epoch": 0.7445109780439122, "grad_norm": 1.460570451296447, "learning_rate": 1.6169916232442923e-06, "loss": 0.0306, "step": 2611 }, { "epoch": 0.744796122041631, "grad_norm": 1.0008169723964753, "learning_rate": 1.6135921418712959e-06, "loss": 0.0215, "step": 2612 }, { "epoch": 0.7450812660393499, "grad_norm": 2.086617218943197, "learning_rate": 1.610195550018358e-06, "loss": 0.0274, "step": 2613 }, { "epoch": 0.7453664100370687, "grad_norm": 1.6041573788510077, "learning_rate": 1.6068018505836901e-06, "loss": 0.019, "step": 2614 }, { "epoch": 0.7456515540347876, "grad_norm": 0.7713053016927894, "learning_rate": 1.6034110464630325e-06, "loss": 0.0232, "step": 2615 }, { "epoch": 0.7459366980325064, "grad_norm": 0.5511320080746058, "learning_rate": 1.6000231405496602e-06, "loss": 0.0191, "step": 2616 }, { "epoch": 0.7462218420302252, "grad_norm": 1.0634637982976405, "learning_rate": 1.5966381357343708e-06, "loss": 0.0175, "step": 2617 }, { "epoch": 0.7465069860279441, "grad_norm": 1.0374330207474594, "learning_rate": 1.5932560349054838e-06, "loss": 0.0245, "step": 2618 }, { "epoch": 0.7467921300256629, "grad_norm": 1.0108319276571467, "learning_rate": 1.5898768409488447e-06, "loss": 0.0136, "step": 2619 }, { "epoch": 0.7470772740233818, "grad_norm": 1.6396915594949968, "learning_rate": 1.5865005567478215e-06, "loss": 0.0251, "step": 2620 }, { "epoch": 0.7473624180211007, "grad_norm": 0.9627999719461374, "learning_rate": 1.5831271851832937e-06, "loss": 0.0391, "step": 2621 }, { "epoch": 0.7476475620188195, "grad_norm": 0.8253787374724513, "learning_rate": 1.5797567291336586e-06, "loss": 0.0122, "step": 2622 }, { "epoch": 0.7479327060165384, "grad_norm": 0.6661195439268087, "learning_rate": 1.5763891914748241e-06, "loss": 0.0221, "step": 2623 }, { "epoch": 0.7482178500142572, "grad_norm": 0.8779901773734643, "learning_rate": 1.5730245750802103e-06, "loss": 0.0195, "step": 2624 }, { "epoch": 0.7485029940119761, "grad_norm": 1.6634272557568168, "learning_rate": 1.5696628828207421e-06, "loss": 0.037, "step": 2625 }, { "epoch": 0.7487881380096949, "grad_norm": 1.4516278438233137, "learning_rate": 1.5663041175648513e-06, "loss": 0.0219, "step": 2626 }, { "epoch": 0.7490732820074137, "grad_norm": 1.907475947775475, "learning_rate": 1.5629482821784691e-06, "loss": 0.0314, "step": 2627 }, { "epoch": 0.7493584260051326, "grad_norm": 1.2003200972703238, "learning_rate": 1.5595953795250346e-06, "loss": 0.0242, "step": 2628 }, { "epoch": 0.7496435700028514, "grad_norm": 1.1021426342768983, "learning_rate": 1.556245412465473e-06, "loss": 0.034, "step": 2629 }, { "epoch": 0.7499287140005703, "grad_norm": 1.0302550777919475, "learning_rate": 1.5528983838582129e-06, "loss": 0.0285, "step": 2630 }, { "epoch": 0.7502138579982891, "grad_norm": 0.8308789044386139, "learning_rate": 1.5495542965591709e-06, "loss": 0.0114, "step": 2631 }, { "epoch": 0.7504990019960079, "grad_norm": 0.8819505544759334, "learning_rate": 1.5462131534217607e-06, "loss": 0.0159, "step": 2632 }, { "epoch": 0.7507841459937268, "grad_norm": 0.9835443580092531, "learning_rate": 1.542874957296876e-06, "loss": 0.0154, "step": 2633 }, { "epoch": 0.7510692899914457, "grad_norm": 1.0665665377487932, "learning_rate": 1.5395397110329024e-06, "loss": 0.0353, "step": 2634 }, { "epoch": 0.7513544339891646, "grad_norm": 0.8164632784798294, "learning_rate": 1.5362074174756998e-06, "loss": 0.0134, "step": 2635 }, { "epoch": 0.7516395779868834, "grad_norm": 1.646300986945349, "learning_rate": 1.5328780794686188e-06, "loss": 0.0394, "step": 2636 }, { "epoch": 0.7519247219846023, "grad_norm": 1.1301313594828286, "learning_rate": 1.5295516998524823e-06, "loss": 0.0142, "step": 2637 }, { "epoch": 0.7522098659823211, "grad_norm": 0.5764109215471761, "learning_rate": 1.5262282814655893e-06, "loss": 0.0142, "step": 2638 }, { "epoch": 0.7524950099800399, "grad_norm": 1.3610524683318708, "learning_rate": 1.5229078271437141e-06, "loss": 0.034, "step": 2639 }, { "epoch": 0.7527801539777588, "grad_norm": 0.477649205446183, "learning_rate": 1.5195903397200996e-06, "loss": 0.0109, "step": 2640 }, { "epoch": 0.7530652979754776, "grad_norm": 1.515573238930559, "learning_rate": 1.5162758220254586e-06, "loss": 0.0166, "step": 2641 }, { "epoch": 0.7533504419731964, "grad_norm": 0.7091310562536813, "learning_rate": 1.5129642768879687e-06, "loss": 0.0135, "step": 2642 }, { "epoch": 0.7536355859709153, "grad_norm": 1.5074902375844876, "learning_rate": 1.5096557071332712e-06, "loss": 0.0253, "step": 2643 }, { "epoch": 0.7539207299686341, "grad_norm": 1.0767879630125734, "learning_rate": 1.5063501155844723e-06, "loss": 0.0167, "step": 2644 }, { "epoch": 0.754205873966353, "grad_norm": 3.1348957847232226, "learning_rate": 1.5030475050621336e-06, "loss": 0.0288, "step": 2645 }, { "epoch": 0.7544910179640718, "grad_norm": 0.6684133082529641, "learning_rate": 1.49974787838427e-06, "loss": 0.0084, "step": 2646 }, { "epoch": 0.7547761619617908, "grad_norm": 0.7727753084015943, "learning_rate": 1.4964512383663544e-06, "loss": 0.0219, "step": 2647 }, { "epoch": 0.7550613059595096, "grad_norm": 2.0188024106365887, "learning_rate": 1.4931575878213127e-06, "loss": 0.0395, "step": 2648 }, { "epoch": 0.7553464499572284, "grad_norm": 0.4906606761025358, "learning_rate": 1.4898669295595181e-06, "loss": 0.0069, "step": 2649 }, { "epoch": 0.7556315939549473, "grad_norm": 1.260687367143981, "learning_rate": 1.4865792663887907e-06, "loss": 0.0319, "step": 2650 }, { "epoch": 0.7559167379526661, "grad_norm": 1.0883273196471233, "learning_rate": 1.4832946011143906e-06, "loss": 0.0264, "step": 2651 }, { "epoch": 0.756201881950385, "grad_norm": 0.35999385019644614, "learning_rate": 1.4800129365390282e-06, "loss": 0.0055, "step": 2652 }, { "epoch": 0.7564870259481038, "grad_norm": 0.4561544926370098, "learning_rate": 1.4767342754628477e-06, "loss": 0.0095, "step": 2653 }, { "epoch": 0.7567721699458226, "grad_norm": 0.8901887574735574, "learning_rate": 1.4734586206834323e-06, "loss": 0.0198, "step": 2654 }, { "epoch": 0.7570573139435415, "grad_norm": 0.7083720617572299, "learning_rate": 1.4701859749958004e-06, "loss": 0.0152, "step": 2655 }, { "epoch": 0.7573424579412603, "grad_norm": 1.1369840531128947, "learning_rate": 1.466916341192401e-06, "loss": 0.0141, "step": 2656 }, { "epoch": 0.7576276019389792, "grad_norm": 1.2957588766875028, "learning_rate": 1.4636497220631145e-06, "loss": 0.0336, "step": 2657 }, { "epoch": 0.757912745936698, "grad_norm": 2.0713263046299173, "learning_rate": 1.4603861203952502e-06, "loss": 0.0302, "step": 2658 }, { "epoch": 0.7581978899344168, "grad_norm": 1.1733273590818454, "learning_rate": 1.4571255389735385e-06, "loss": 0.0234, "step": 2659 }, { "epoch": 0.7584830339321357, "grad_norm": 2.1819052963632717, "learning_rate": 1.4538679805801386e-06, "loss": 0.0329, "step": 2660 }, { "epoch": 0.7587681779298546, "grad_norm": 0.3495163345720255, "learning_rate": 1.4506134479946281e-06, "loss": 0.0098, "step": 2661 }, { "epoch": 0.7590533219275735, "grad_norm": 0.5490129444641568, "learning_rate": 1.4473619439939985e-06, "loss": 0.0065, "step": 2662 }, { "epoch": 0.7593384659252923, "grad_norm": 1.581308447857722, "learning_rate": 1.4441134713526595e-06, "loss": 0.057, "step": 2663 }, { "epoch": 0.7596236099230111, "grad_norm": 1.0260023381149048, "learning_rate": 1.440868032842439e-06, "loss": 0.0185, "step": 2664 }, { "epoch": 0.75990875392073, "grad_norm": 0.5787379526406348, "learning_rate": 1.437625631232571e-06, "loss": 0.0151, "step": 2665 }, { "epoch": 0.7601938979184488, "grad_norm": 1.5032898745586536, "learning_rate": 1.4343862692896986e-06, "loss": 0.0191, "step": 2666 }, { "epoch": 0.7604790419161677, "grad_norm": 0.9364587983512026, "learning_rate": 1.431149949777873e-06, "loss": 0.0138, "step": 2667 }, { "epoch": 0.7607641859138865, "grad_norm": 0.7579871237730098, "learning_rate": 1.4279166754585472e-06, "loss": 0.0236, "step": 2668 }, { "epoch": 0.7610493299116053, "grad_norm": 1.2909526522887504, "learning_rate": 1.4246864490905776e-06, "loss": 0.0228, "step": 2669 }, { "epoch": 0.7613344739093242, "grad_norm": 1.313113386385982, "learning_rate": 1.421459273430219e-06, "loss": 0.0234, "step": 2670 }, { "epoch": 0.761619617907043, "grad_norm": 0.8158285471748641, "learning_rate": 1.4182351512311237e-06, "loss": 0.027, "step": 2671 }, { "epoch": 0.7619047619047619, "grad_norm": 1.1047514162563927, "learning_rate": 1.4150140852443389e-06, "loss": 0.016, "step": 2672 }, { "epoch": 0.7621899059024807, "grad_norm": 0.7487315401677382, "learning_rate": 1.4117960782183021e-06, "loss": 0.0075, "step": 2673 }, { "epoch": 0.7624750499001997, "grad_norm": 1.5926773015253632, "learning_rate": 1.4085811328988425e-06, "loss": 0.0245, "step": 2674 }, { "epoch": 0.7627601938979185, "grad_norm": 1.0230513489809057, "learning_rate": 1.405369252029175e-06, "loss": 0.0108, "step": 2675 }, { "epoch": 0.7630453378956373, "grad_norm": 0.6835988253710413, "learning_rate": 1.4021604383499044e-06, "loss": 0.0141, "step": 2676 }, { "epoch": 0.7633304818933562, "grad_norm": 1.2256591335523719, "learning_rate": 1.3989546945990129e-06, "loss": 0.0158, "step": 2677 }, { "epoch": 0.763615625891075, "grad_norm": 1.2930161491847667, "learning_rate": 1.395752023511867e-06, "loss": 0.0195, "step": 2678 }, { "epoch": 0.7639007698887939, "grad_norm": 0.8994233014350279, "learning_rate": 1.392552427821205e-06, "loss": 0.0087, "step": 2679 }, { "epoch": 0.7641859138865127, "grad_norm": 0.6770559844138675, "learning_rate": 1.3893559102571502e-06, "loss": 0.0141, "step": 2680 }, { "epoch": 0.7644710578842315, "grad_norm": 1.4930856086234503, "learning_rate": 1.386162473547194e-06, "loss": 0.0343, "step": 2681 }, { "epoch": 0.7647562018819504, "grad_norm": 1.682448635821019, "learning_rate": 1.3829721204162e-06, "loss": 0.0297, "step": 2682 }, { "epoch": 0.7650413458796692, "grad_norm": 1.5491061182481676, "learning_rate": 1.3797848535863995e-06, "loss": 0.0326, "step": 2683 }, { "epoch": 0.765326489877388, "grad_norm": 1.175627444484784, "learning_rate": 1.3766006757773932e-06, "loss": 0.0218, "step": 2684 }, { "epoch": 0.7656116338751069, "grad_norm": 0.9840348835612843, "learning_rate": 1.373419589706143e-06, "loss": 0.0211, "step": 2685 }, { "epoch": 0.7658967778728257, "grad_norm": 1.0057768113881969, "learning_rate": 1.3702415980869743e-06, "loss": 0.0464, "step": 2686 }, { "epoch": 0.7661819218705447, "grad_norm": 1.7830889645336818, "learning_rate": 1.3670667036315728e-06, "loss": 0.0185, "step": 2687 }, { "epoch": 0.7664670658682635, "grad_norm": 0.4224535629956841, "learning_rate": 1.3638949090489772e-06, "loss": 0.0081, "step": 2688 }, { "epoch": 0.7667522098659824, "grad_norm": 0.6377033854455558, "learning_rate": 1.360726217045591e-06, "loss": 0.01, "step": 2689 }, { "epoch": 0.7670373538637012, "grad_norm": 1.6543881718541624, "learning_rate": 1.3575606303251582e-06, "loss": 0.0274, "step": 2690 }, { "epoch": 0.76732249786142, "grad_norm": 0.705004441035942, "learning_rate": 1.3543981515887788e-06, "loss": 0.0168, "step": 2691 }, { "epoch": 0.7676076418591389, "grad_norm": 1.5935922533656512, "learning_rate": 1.3512387835349045e-06, "loss": 0.0248, "step": 2692 }, { "epoch": 0.7678927858568577, "grad_norm": 0.7488281251727223, "learning_rate": 1.3480825288593274e-06, "loss": 0.0139, "step": 2693 }, { "epoch": 0.7681779298545766, "grad_norm": 0.8165161964838176, "learning_rate": 1.3449293902551857e-06, "loss": 0.0199, "step": 2694 }, { "epoch": 0.7684630738522954, "grad_norm": 1.5726186357766658, "learning_rate": 1.341779370412954e-06, "loss": 0.0364, "step": 2695 }, { "epoch": 0.7687482178500142, "grad_norm": 0.8244759122464347, "learning_rate": 1.3386324720204542e-06, "loss": 0.0119, "step": 2696 }, { "epoch": 0.7690333618477331, "grad_norm": 1.5946273315050323, "learning_rate": 1.3354886977628374e-06, "loss": 0.0267, "step": 2697 }, { "epoch": 0.7693185058454519, "grad_norm": 0.8033194869459227, "learning_rate": 1.3323480503225933e-06, "loss": 0.0086, "step": 2698 }, { "epoch": 0.7696036498431708, "grad_norm": 1.1393108615672012, "learning_rate": 1.3292105323795406e-06, "loss": 0.0277, "step": 2699 }, { "epoch": 0.7698887938408896, "grad_norm": 1.9860907344454382, "learning_rate": 1.3260761466108302e-06, "loss": 0.0211, "step": 2700 }, { "epoch": 0.7701739378386085, "grad_norm": 1.5082456222402305, "learning_rate": 1.3229448956909385e-06, "loss": 0.022, "step": 2701 }, { "epoch": 0.7704590818363274, "grad_norm": 1.5344921371462583, "learning_rate": 1.3198167822916685e-06, "loss": 0.0245, "step": 2702 }, { "epoch": 0.7707442258340462, "grad_norm": 0.7564484297441108, "learning_rate": 1.3166918090821445e-06, "loss": 0.0141, "step": 2703 }, { "epoch": 0.7710293698317651, "grad_norm": 0.43057532343881677, "learning_rate": 1.3135699787288114e-06, "loss": 0.0059, "step": 2704 }, { "epoch": 0.7713145138294839, "grad_norm": 1.0858462972323473, "learning_rate": 1.3104512938954373e-06, "loss": 0.0327, "step": 2705 }, { "epoch": 0.7715996578272027, "grad_norm": 1.1709018893965801, "learning_rate": 1.3073357572430984e-06, "loss": 0.032, "step": 2706 }, { "epoch": 0.7718848018249216, "grad_norm": 0.28314832345320107, "learning_rate": 1.3042233714301893e-06, "loss": 0.0063, "step": 2707 }, { "epoch": 0.7721699458226404, "grad_norm": 1.0670337951747642, "learning_rate": 1.3011141391124138e-06, "loss": 0.017, "step": 2708 }, { "epoch": 0.7724550898203593, "grad_norm": 1.5924118733503132, "learning_rate": 1.2980080629427904e-06, "loss": 0.0261, "step": 2709 }, { "epoch": 0.7727402338180781, "grad_norm": 0.8829070105724145, "learning_rate": 1.2949051455716378e-06, "loss": 0.0107, "step": 2710 }, { "epoch": 0.7730253778157969, "grad_norm": 0.4216528848394497, "learning_rate": 1.2918053896465826e-06, "loss": 0.012, "step": 2711 }, { "epoch": 0.7733105218135158, "grad_norm": 0.7050612966316768, "learning_rate": 1.2887087978125535e-06, "loss": 0.0093, "step": 2712 }, { "epoch": 0.7735956658112346, "grad_norm": 1.1605581061863133, "learning_rate": 1.2856153727117792e-06, "loss": 0.0212, "step": 2713 }, { "epoch": 0.7738808098089536, "grad_norm": 1.3212374854178839, "learning_rate": 1.2825251169837865e-06, "loss": 0.0224, "step": 2714 }, { "epoch": 0.7741659538066724, "grad_norm": 1.1885252640350221, "learning_rate": 1.2794380332653976e-06, "loss": 0.0208, "step": 2715 }, { "epoch": 0.7744510978043913, "grad_norm": 1.7760519011776683, "learning_rate": 1.2763541241907268e-06, "loss": 0.0569, "step": 2716 }, { "epoch": 0.7747362418021101, "grad_norm": 0.9620142840295433, "learning_rate": 1.2732733923911854e-06, "loss": 0.0161, "step": 2717 }, { "epoch": 0.7750213857998289, "grad_norm": 1.1933906064161974, "learning_rate": 1.270195840495465e-06, "loss": 0.0181, "step": 2718 }, { "epoch": 0.7753065297975478, "grad_norm": 1.0984270689396793, "learning_rate": 1.2671214711295492e-06, "loss": 0.0144, "step": 2719 }, { "epoch": 0.7755916737952666, "grad_norm": 0.7603521204159235, "learning_rate": 1.2640502869167043e-06, "loss": 0.014, "step": 2720 }, { "epoch": 0.7758768177929855, "grad_norm": 0.42403568104580863, "learning_rate": 1.260982290477482e-06, "loss": 0.0083, "step": 2721 }, { "epoch": 0.7761619617907043, "grad_norm": 0.865345796891974, "learning_rate": 1.2579174844297114e-06, "loss": 0.0112, "step": 2722 }, { "epoch": 0.7764471057884231, "grad_norm": 1.3649714875018568, "learning_rate": 1.2548558713884963e-06, "loss": 0.0109, "step": 2723 }, { "epoch": 0.776732249786142, "grad_norm": 0.8805998540651545, "learning_rate": 1.2517974539662198e-06, "loss": 0.0181, "step": 2724 }, { "epoch": 0.7770173937838608, "grad_norm": 1.133487480597252, "learning_rate": 1.2487422347725397e-06, "loss": 0.0439, "step": 2725 }, { "epoch": 0.7773025377815796, "grad_norm": 1.077854176177537, "learning_rate": 1.2456902164143813e-06, "loss": 0.0107, "step": 2726 }, { "epoch": 0.7775876817792986, "grad_norm": 1.4419228139613494, "learning_rate": 1.2426414014959409e-06, "loss": 0.0252, "step": 2727 }, { "epoch": 0.7778728257770174, "grad_norm": 0.40646062715513975, "learning_rate": 1.2395957926186802e-06, "loss": 0.0048, "step": 2728 }, { "epoch": 0.7781579697747363, "grad_norm": 0.8225502003586711, "learning_rate": 1.2365533923813256e-06, "loss": 0.0199, "step": 2729 }, { "epoch": 0.7784431137724551, "grad_norm": 1.59562236103222, "learning_rate": 1.2335142033798658e-06, "loss": 0.0508, "step": 2730 }, { "epoch": 0.778728257770174, "grad_norm": 1.233387689768871, "learning_rate": 1.23047822820755e-06, "loss": 0.0287, "step": 2731 }, { "epoch": 0.7790134017678928, "grad_norm": 0.5500387337491803, "learning_rate": 1.227445469454882e-06, "loss": 0.0065, "step": 2732 }, { "epoch": 0.7792985457656116, "grad_norm": 2.0299603071873533, "learning_rate": 1.2244159297096291e-06, "loss": 0.0415, "step": 2733 }, { "epoch": 0.7795836897633305, "grad_norm": 1.353228704932132, "learning_rate": 1.2213896115568024e-06, "loss": 0.0387, "step": 2734 }, { "epoch": 0.7798688337610493, "grad_norm": 1.6415096320435094, "learning_rate": 1.2183665175786686e-06, "loss": 0.0424, "step": 2735 }, { "epoch": 0.7801539777587682, "grad_norm": 2.374020647253202, "learning_rate": 1.2153466503547417e-06, "loss": 0.0519, "step": 2736 }, { "epoch": 0.780439121756487, "grad_norm": 1.8977748402119863, "learning_rate": 1.2123300124617876e-06, "loss": 0.0722, "step": 2737 }, { "epoch": 0.7807242657542058, "grad_norm": 0.7295480552079158, "learning_rate": 1.2093166064738098e-06, "loss": 0.0135, "step": 2738 }, { "epoch": 0.7810094097519247, "grad_norm": 0.8229549593104721, "learning_rate": 1.2063064349620596e-06, "loss": 0.0259, "step": 2739 }, { "epoch": 0.7812945537496435, "grad_norm": 0.6925997286050343, "learning_rate": 1.2032995004950216e-06, "loss": 0.0146, "step": 2740 }, { "epoch": 0.7815796977473625, "grad_norm": 0.9014027542875058, "learning_rate": 1.2002958056384262e-06, "loss": 0.0234, "step": 2741 }, { "epoch": 0.7818648417450813, "grad_norm": 0.706354278112295, "learning_rate": 1.197295352955235e-06, "loss": 0.0086, "step": 2742 }, { "epoch": 0.7821499857428001, "grad_norm": 0.8645190729336512, "learning_rate": 1.194298145005644e-06, "loss": 0.017, "step": 2743 }, { "epoch": 0.782435129740519, "grad_norm": 0.37228866967958824, "learning_rate": 1.1913041843470807e-06, "loss": 0.0096, "step": 2744 }, { "epoch": 0.7827202737382378, "grad_norm": 0.6034253285490447, "learning_rate": 1.1883134735342023e-06, "loss": 0.0153, "step": 2745 }, { "epoch": 0.7830054177359567, "grad_norm": 1.1035155409154056, "learning_rate": 1.1853260151188912e-06, "loss": 0.0318, "step": 2746 }, { "epoch": 0.7832905617336755, "grad_norm": 0.7012984171334737, "learning_rate": 1.1823418116502566e-06, "loss": 0.0133, "step": 2747 }, { "epoch": 0.7835757057313943, "grad_norm": 2.2145145784676674, "learning_rate": 1.1793608656746286e-06, "loss": 0.0276, "step": 2748 }, { "epoch": 0.7838608497291132, "grad_norm": 1.4050671677912367, "learning_rate": 1.1763831797355612e-06, "loss": 0.0201, "step": 2749 }, { "epoch": 0.784145993726832, "grad_norm": 0.8314849154599873, "learning_rate": 1.1734087563738245e-06, "loss": 0.0146, "step": 2750 }, { "epoch": 0.7844311377245509, "grad_norm": 1.6532863006133396, "learning_rate": 1.1704375981274019e-06, "loss": 0.0234, "step": 2751 }, { "epoch": 0.7847162817222697, "grad_norm": 0.6670573448259365, "learning_rate": 1.1674697075314928e-06, "loss": 0.0135, "step": 2752 }, { "epoch": 0.7850014257199885, "grad_norm": 0.6240914012825134, "learning_rate": 1.1645050871185121e-06, "loss": 0.0116, "step": 2753 }, { "epoch": 0.7852865697177075, "grad_norm": 0.9870483425644136, "learning_rate": 1.1615437394180806e-06, "loss": 0.0107, "step": 2754 }, { "epoch": 0.7855717137154263, "grad_norm": 1.9210648543710227, "learning_rate": 1.1585856669570277e-06, "loss": 0.0399, "step": 2755 }, { "epoch": 0.7858568577131452, "grad_norm": 1.7634186455826524, "learning_rate": 1.1556308722593845e-06, "loss": 0.0589, "step": 2756 }, { "epoch": 0.786142001710864, "grad_norm": 1.2268952626774527, "learning_rate": 1.152679357846392e-06, "loss": 0.0232, "step": 2757 }, { "epoch": 0.7864271457085829, "grad_norm": 1.456806678520593, "learning_rate": 1.1497311262364874e-06, "loss": 0.0243, "step": 2758 }, { "epoch": 0.7867122897063017, "grad_norm": 0.4950354070457815, "learning_rate": 1.1467861799453084e-06, "loss": 0.0072, "step": 2759 }, { "epoch": 0.7869974337040205, "grad_norm": 0.8080613169821007, "learning_rate": 1.143844521485688e-06, "loss": 0.016, "step": 2760 }, { "epoch": 0.7872825777017394, "grad_norm": 1.600494453613366, "learning_rate": 1.1409061533676586e-06, "loss": 0.0267, "step": 2761 }, { "epoch": 0.7875677216994582, "grad_norm": 1.9230161162665296, "learning_rate": 1.1379710780984376e-06, "loss": 0.0278, "step": 2762 }, { "epoch": 0.787852865697177, "grad_norm": 1.04041200038653, "learning_rate": 1.1350392981824375e-06, "loss": 0.0226, "step": 2763 }, { "epoch": 0.7881380096948959, "grad_norm": 0.20832185978045045, "learning_rate": 1.1321108161212574e-06, "loss": 0.0049, "step": 2764 }, { "epoch": 0.7884231536926147, "grad_norm": 0.6598830632065913, "learning_rate": 1.1291856344136853e-06, "loss": 0.0072, "step": 2765 }, { "epoch": 0.7887082976903336, "grad_norm": 1.6941306324384335, "learning_rate": 1.1262637555556905e-06, "loss": 0.0444, "step": 2766 }, { "epoch": 0.7889934416880525, "grad_norm": 2.3563085514868853, "learning_rate": 1.1233451820404222e-06, "loss": 0.0641, "step": 2767 }, { "epoch": 0.7892785856857714, "grad_norm": 0.5794506056340978, "learning_rate": 1.1204299163582117e-06, "loss": 0.0099, "step": 2768 }, { "epoch": 0.7895637296834902, "grad_norm": 0.5242308434486851, "learning_rate": 1.1175179609965697e-06, "loss": 0.0082, "step": 2769 }, { "epoch": 0.789848873681209, "grad_norm": 1.1466102878255973, "learning_rate": 1.1146093184401791e-06, "loss": 0.0196, "step": 2770 }, { "epoch": 0.7901340176789279, "grad_norm": 1.8361430845752722, "learning_rate": 1.1117039911708966e-06, "loss": 0.039, "step": 2771 }, { "epoch": 0.7904191616766467, "grad_norm": 2.1710130177501252, "learning_rate": 1.108801981667752e-06, "loss": 0.0378, "step": 2772 }, { "epoch": 0.7907043056743656, "grad_norm": 1.0607481844235185, "learning_rate": 1.1059032924069419e-06, "loss": 0.0198, "step": 2773 }, { "epoch": 0.7909894496720844, "grad_norm": 1.1534272328002837, "learning_rate": 1.1030079258618303e-06, "loss": 0.0181, "step": 2774 }, { "epoch": 0.7912745936698032, "grad_norm": 1.4451323963964195, "learning_rate": 1.1001158845029475e-06, "loss": 0.028, "step": 2775 }, { "epoch": 0.7915597376675221, "grad_norm": 1.0170979316582653, "learning_rate": 1.0972271707979837e-06, "loss": 0.0151, "step": 2776 }, { "epoch": 0.7918448816652409, "grad_norm": 0.4474136512465243, "learning_rate": 1.0943417872117956e-06, "loss": 0.0113, "step": 2777 }, { "epoch": 0.7921300256629598, "grad_norm": 1.0050859141496182, "learning_rate": 1.09145973620639e-06, "loss": 0.0114, "step": 2778 }, { "epoch": 0.7924151696606786, "grad_norm": 1.7556272164199616, "learning_rate": 1.0885810202409358e-06, "loss": 0.0284, "step": 2779 }, { "epoch": 0.7927003136583974, "grad_norm": 0.8890931457178195, "learning_rate": 1.0857056417717538e-06, "loss": 0.0171, "step": 2780 }, { "epoch": 0.7929854576561164, "grad_norm": 0.9710326204173798, "learning_rate": 1.0828336032523206e-06, "loss": 0.0303, "step": 2781 }, { "epoch": 0.7932706016538352, "grad_norm": 1.23379631524315, "learning_rate": 1.0799649071332585e-06, "loss": 0.0203, "step": 2782 }, { "epoch": 0.7935557456515541, "grad_norm": 1.0048302038142245, "learning_rate": 1.077099555862342e-06, "loss": 0.0184, "step": 2783 }, { "epoch": 0.7938408896492729, "grad_norm": 0.5781905231147105, "learning_rate": 1.0742375518844845e-06, "loss": 0.0065, "step": 2784 }, { "epoch": 0.7941260336469917, "grad_norm": 0.6193101566716163, "learning_rate": 1.0713788976417522e-06, "loss": 0.0142, "step": 2785 }, { "epoch": 0.7944111776447106, "grad_norm": 0.4952614177724453, "learning_rate": 1.068523595573348e-06, "loss": 0.0058, "step": 2786 }, { "epoch": 0.7946963216424294, "grad_norm": 1.2992089082042755, "learning_rate": 1.0656716481156144e-06, "loss": 0.0278, "step": 2787 }, { "epoch": 0.7949814656401483, "grad_norm": 1.0260825841582102, "learning_rate": 1.0628230577020327e-06, "loss": 0.0125, "step": 2788 }, { "epoch": 0.7952666096378671, "grad_norm": 1.650422933470823, "learning_rate": 1.05997782676322e-06, "loss": 0.0284, "step": 2789 }, { "epoch": 0.795551753635586, "grad_norm": 1.6922243882463914, "learning_rate": 1.0571359577269263e-06, "loss": 0.034, "step": 2790 }, { "epoch": 0.7958368976333048, "grad_norm": 2.064635080746134, "learning_rate": 1.0542974530180327e-06, "loss": 0.0331, "step": 2791 }, { "epoch": 0.7961220416310236, "grad_norm": 0.6979973531032883, "learning_rate": 1.0514623150585484e-06, "loss": 0.015, "step": 2792 }, { "epoch": 0.7964071856287425, "grad_norm": 1.9320077547472618, "learning_rate": 1.048630546267615e-06, "loss": 0.048, "step": 2793 }, { "epoch": 0.7966923296264614, "grad_norm": 0.9851340654285308, "learning_rate": 1.045802149061495e-06, "loss": 0.0095, "step": 2794 }, { "epoch": 0.7969774736241803, "grad_norm": 0.33088819901168887, "learning_rate": 1.0429771258535726e-06, "loss": 0.007, "step": 2795 }, { "epoch": 0.7972626176218991, "grad_norm": 1.4719706011002405, "learning_rate": 1.0401554790543545e-06, "loss": 0.0243, "step": 2796 }, { "epoch": 0.7975477616196179, "grad_norm": 2.20284051649092, "learning_rate": 1.0373372110714697e-06, "loss": 0.0381, "step": 2797 }, { "epoch": 0.7978329056173368, "grad_norm": 0.9042831305808354, "learning_rate": 1.0345223243096614e-06, "loss": 0.0133, "step": 2798 }, { "epoch": 0.7981180496150556, "grad_norm": 0.4357457490944433, "learning_rate": 1.0317108211707883e-06, "loss": 0.0059, "step": 2799 }, { "epoch": 0.7984031936127745, "grad_norm": 1.4454010907254553, "learning_rate": 1.0289027040538174e-06, "loss": 0.0271, "step": 2800 }, { "epoch": 0.7986883376104933, "grad_norm": 0.8325294383532024, "learning_rate": 1.0260979753548356e-06, "loss": 0.0242, "step": 2801 }, { "epoch": 0.7989734816082121, "grad_norm": 0.6799031003016142, "learning_rate": 1.023296637467031e-06, "loss": 0.0115, "step": 2802 }, { "epoch": 0.799258625605931, "grad_norm": 0.8000424760149701, "learning_rate": 1.0204986927807026e-06, "loss": 0.0139, "step": 2803 }, { "epoch": 0.7995437696036498, "grad_norm": 1.0437386516730733, "learning_rate": 1.0177041436832508e-06, "loss": 0.0234, "step": 2804 }, { "epoch": 0.7998289136013687, "grad_norm": 0.9328110888666781, "learning_rate": 1.0149129925591816e-06, "loss": 0.0411, "step": 2805 }, { "epoch": 0.8001140575990875, "grad_norm": 0.8919536940706596, "learning_rate": 1.0121252417901e-06, "loss": 0.0243, "step": 2806 }, { "epoch": 0.8003992015968064, "grad_norm": 1.7879385125082785, "learning_rate": 1.00934089375471e-06, "loss": 0.0299, "step": 2807 }, { "epoch": 0.8006843455945253, "grad_norm": 1.0080187097759719, "learning_rate": 1.006559950828812e-06, "loss": 0.0275, "step": 2808 }, { "epoch": 0.8009694895922441, "grad_norm": 2.297194259534309, "learning_rate": 1.0037824153852993e-06, "loss": 0.0589, "step": 2809 }, { "epoch": 0.801254633589963, "grad_norm": 1.3722790217531777, "learning_rate": 1.0010082897941642e-06, "loss": 0.0468, "step": 2810 }, { "epoch": 0.8015397775876818, "grad_norm": 2.880885415554408, "learning_rate": 9.982375764224805e-07, "loss": 0.0544, "step": 2811 }, { "epoch": 0.8018249215854006, "grad_norm": 1.3779641066305213, "learning_rate": 9.954702776344144e-07, "loss": 0.0498, "step": 2812 }, { "epoch": 0.8021100655831195, "grad_norm": 1.578974235716462, "learning_rate": 9.927063957912214e-07, "loss": 0.0309, "step": 2813 }, { "epoch": 0.8023952095808383, "grad_norm": 0.5250722169025638, "learning_rate": 9.89945933251238e-07, "loss": 0.0073, "step": 2814 }, { "epoch": 0.8026803535785572, "grad_norm": 1.214875683960437, "learning_rate": 9.871888923698836e-07, "loss": 0.0313, "step": 2815 }, { "epoch": 0.802965497576276, "grad_norm": 1.899479207342997, "learning_rate": 9.844352754996578e-07, "loss": 0.0394, "step": 2816 }, { "epoch": 0.8032506415739948, "grad_norm": 1.7956615761315469, "learning_rate": 9.816850849901404e-07, "loss": 0.037, "step": 2817 }, { "epoch": 0.8035357855717137, "grad_norm": 0.987617818563438, "learning_rate": 9.78938323187985e-07, "loss": 0.0249, "step": 2818 }, { "epoch": 0.8038209295694325, "grad_norm": 2.2873969718098177, "learning_rate": 9.761949924369217e-07, "loss": 0.0508, "step": 2819 }, { "epoch": 0.8041060735671514, "grad_norm": 0.6727560710264749, "learning_rate": 9.734550950777523e-07, "loss": 0.0135, "step": 2820 }, { "epoch": 0.8043912175648703, "grad_norm": 1.1083108655277412, "learning_rate": 9.707186334483477e-07, "loss": 0.0301, "step": 2821 }, { "epoch": 0.8046763615625891, "grad_norm": 1.4347478292304672, "learning_rate": 9.679856098836498e-07, "loss": 0.018, "step": 2822 }, { "epoch": 0.804961505560308, "grad_norm": 0.7012755730521804, "learning_rate": 9.652560267156647e-07, "loss": 0.0096, "step": 2823 }, { "epoch": 0.8052466495580268, "grad_norm": 0.76616377946827, "learning_rate": 9.625298862734645e-07, "loss": 0.0136, "step": 2824 }, { "epoch": 0.8055317935557457, "grad_norm": 1.510172264857123, "learning_rate": 9.598071908831802e-07, "loss": 0.0371, "step": 2825 }, { "epoch": 0.8058169375534645, "grad_norm": 0.9247247630615653, "learning_rate": 9.570879428680097e-07, "loss": 0.0162, "step": 2826 }, { "epoch": 0.8061020815511833, "grad_norm": 1.4471204790194672, "learning_rate": 9.543721445482051e-07, "loss": 0.0414, "step": 2827 }, { "epoch": 0.8063872255489022, "grad_norm": 1.1261570935477987, "learning_rate": 9.516597982410719e-07, "loss": 0.0222, "step": 2828 }, { "epoch": 0.806672369546621, "grad_norm": 1.213228339092222, "learning_rate": 9.489509062609776e-07, "loss": 0.019, "step": 2829 }, { "epoch": 0.8069575135443399, "grad_norm": 1.0568593386045195, "learning_rate": 9.46245470919337e-07, "loss": 0.0162, "step": 2830 }, { "epoch": 0.8072426575420587, "grad_norm": 1.0598129850467366, "learning_rate": 9.435434945246164e-07, "loss": 0.0115, "step": 2831 }, { "epoch": 0.8075278015397775, "grad_norm": 1.148087127959207, "learning_rate": 9.408449793823316e-07, "loss": 0.0186, "step": 2832 }, { "epoch": 0.8078129455374964, "grad_norm": 0.7838943287296171, "learning_rate": 9.38149927795044e-07, "loss": 0.0145, "step": 2833 }, { "epoch": 0.8080980895352153, "grad_norm": 0.6450557189362891, "learning_rate": 9.354583420623603e-07, "loss": 0.0178, "step": 2834 }, { "epoch": 0.8083832335329342, "grad_norm": 1.2580295549839888, "learning_rate": 9.327702244809295e-07, "loss": 0.0334, "step": 2835 }, { "epoch": 0.808668377530653, "grad_norm": 0.5841231963670347, "learning_rate": 9.300855773444422e-07, "loss": 0.0171, "step": 2836 }, { "epoch": 0.8089535215283719, "grad_norm": 1.0744692555164113, "learning_rate": 9.274044029436252e-07, "loss": 0.0231, "step": 2837 }, { "epoch": 0.8092386655260907, "grad_norm": 0.5347627958333485, "learning_rate": 9.247267035662483e-07, "loss": 0.0106, "step": 2838 }, { "epoch": 0.8095238095238095, "grad_norm": 0.8639389678174983, "learning_rate": 9.220524814971082e-07, "loss": 0.0098, "step": 2839 }, { "epoch": 0.8098089535215284, "grad_norm": 1.4394556398547473, "learning_rate": 9.193817390180387e-07, "loss": 0.0158, "step": 2840 }, { "epoch": 0.8100940975192472, "grad_norm": 1.7003245750972533, "learning_rate": 9.167144784079035e-07, "loss": 0.0287, "step": 2841 }, { "epoch": 0.810379241516966, "grad_norm": 1.3577296521728226, "learning_rate": 9.140507019425981e-07, "loss": 0.031, "step": 2842 }, { "epoch": 0.8106643855146849, "grad_norm": 1.3152852591455102, "learning_rate": 9.11390411895044e-07, "loss": 0.0271, "step": 2843 }, { "epoch": 0.8109495295124037, "grad_norm": 0.8196188314690228, "learning_rate": 9.087336105351813e-07, "loss": 0.0207, "step": 2844 }, { "epoch": 0.8112346735101226, "grad_norm": 0.7446020625868897, "learning_rate": 9.060803001299833e-07, "loss": 0.023, "step": 2845 }, { "epoch": 0.8115198175078414, "grad_norm": 1.6959863019895527, "learning_rate": 9.034304829434387e-07, "loss": 0.0425, "step": 2846 }, { "epoch": 0.8118049615055604, "grad_norm": 1.5565435765306315, "learning_rate": 9.007841612365575e-07, "loss": 0.0324, "step": 2847 }, { "epoch": 0.8120901055032792, "grad_norm": 1.4501860535298694, "learning_rate": 8.981413372673659e-07, "loss": 0.0276, "step": 2848 }, { "epoch": 0.812375249500998, "grad_norm": 0.8662824114962682, "learning_rate": 8.955020132909059e-07, "loss": 0.0222, "step": 2849 }, { "epoch": 0.8126603934987169, "grad_norm": 0.6872747440077209, "learning_rate": 8.928661915592335e-07, "loss": 0.0064, "step": 2850 }, { "epoch": 0.8129455374964357, "grad_norm": 0.8899013455152293, "learning_rate": 8.902338743214167e-07, "loss": 0.0133, "step": 2851 }, { "epoch": 0.8132306814941546, "grad_norm": 0.75181554380744, "learning_rate": 8.876050638235323e-07, "loss": 0.0204, "step": 2852 }, { "epoch": 0.8135158254918734, "grad_norm": 0.830293688967669, "learning_rate": 8.849797623086643e-07, "loss": 0.011, "step": 2853 }, { "epoch": 0.8138009694895922, "grad_norm": 1.0573364540507733, "learning_rate": 8.823579720169068e-07, "loss": 0.038, "step": 2854 }, { "epoch": 0.8140861134873111, "grad_norm": 1.2831202813735518, "learning_rate": 8.797396951853515e-07, "loss": 0.0322, "step": 2855 }, { "epoch": 0.8143712574850299, "grad_norm": 0.5543201708828293, "learning_rate": 8.771249340480959e-07, "loss": 0.0094, "step": 2856 }, { "epoch": 0.8146564014827488, "grad_norm": 0.9366333141939963, "learning_rate": 8.745136908362367e-07, "loss": 0.0124, "step": 2857 }, { "epoch": 0.8149415454804676, "grad_norm": 1.1448966352215733, "learning_rate": 8.719059677778712e-07, "loss": 0.0348, "step": 2858 }, { "epoch": 0.8152266894781864, "grad_norm": 0.6801824833631003, "learning_rate": 8.693017670980903e-07, "loss": 0.0153, "step": 2859 }, { "epoch": 0.8155118334759053, "grad_norm": 1.490041583362656, "learning_rate": 8.667010910189794e-07, "loss": 0.039, "step": 2860 }, { "epoch": 0.8157969774736242, "grad_norm": 1.3198201360592343, "learning_rate": 8.641039417596181e-07, "loss": 0.0233, "step": 2861 }, { "epoch": 0.8160821214713431, "grad_norm": 0.8793603147206949, "learning_rate": 8.615103215360754e-07, "loss": 0.0206, "step": 2862 }, { "epoch": 0.8163672654690619, "grad_norm": 0.9461948476174028, "learning_rate": 8.589202325614094e-07, "loss": 0.0248, "step": 2863 }, { "epoch": 0.8166524094667807, "grad_norm": 1.016498277100784, "learning_rate": 8.563336770456654e-07, "loss": 0.012, "step": 2864 }, { "epoch": 0.8169375534644996, "grad_norm": 2.102929530837856, "learning_rate": 8.537506571958736e-07, "loss": 0.0296, "step": 2865 }, { "epoch": 0.8172226974622184, "grad_norm": 1.5361923525365948, "learning_rate": 8.511711752160467e-07, "loss": 0.0197, "step": 2866 }, { "epoch": 0.8175078414599373, "grad_norm": 1.2400610374990475, "learning_rate": 8.485952333071801e-07, "loss": 0.0212, "step": 2867 }, { "epoch": 0.8177929854576561, "grad_norm": 0.9973748714157373, "learning_rate": 8.460228336672466e-07, "loss": 0.0389, "step": 2868 }, { "epoch": 0.818078129455375, "grad_norm": 2.49728872513892, "learning_rate": 8.434539784911966e-07, "loss": 0.0497, "step": 2869 }, { "epoch": 0.8183632734530938, "grad_norm": 1.3237326648218986, "learning_rate": 8.408886699709601e-07, "loss": 0.0351, "step": 2870 }, { "epoch": 0.8186484174508126, "grad_norm": 0.8497672015331845, "learning_rate": 8.383269102954367e-07, "loss": 0.0264, "step": 2871 }, { "epoch": 0.8189335614485315, "grad_norm": 1.065455590155432, "learning_rate": 8.357687016504972e-07, "loss": 0.0256, "step": 2872 }, { "epoch": 0.8192187054462503, "grad_norm": 1.5358434738505333, "learning_rate": 8.332140462189841e-07, "loss": 0.0439, "step": 2873 }, { "epoch": 0.8195038494439693, "grad_norm": 1.4294989218591867, "learning_rate": 8.306629461807109e-07, "loss": 0.0286, "step": 2874 }, { "epoch": 0.8197889934416881, "grad_norm": 1.9130676592818923, "learning_rate": 8.281154037124523e-07, "loss": 0.0196, "step": 2875 }, { "epoch": 0.8200741374394069, "grad_norm": 1.406613009250302, "learning_rate": 8.255714209879506e-07, "loss": 0.0262, "step": 2876 }, { "epoch": 0.8203592814371258, "grad_norm": 0.6268363665660219, "learning_rate": 8.230310001779096e-07, "loss": 0.0121, "step": 2877 }, { "epoch": 0.8206444254348446, "grad_norm": 0.7357882911831951, "learning_rate": 8.204941434499941e-07, "loss": 0.0099, "step": 2878 }, { "epoch": 0.8209295694325635, "grad_norm": 1.8175431785227907, "learning_rate": 8.179608529688276e-07, "loss": 0.0249, "step": 2879 }, { "epoch": 0.8212147134302823, "grad_norm": 0.2394768774671226, "learning_rate": 8.154311308959911e-07, "loss": 0.0064, "step": 2880 }, { "epoch": 0.8214998574280011, "grad_norm": 1.0082037597372302, "learning_rate": 8.129049793900185e-07, "loss": 0.0152, "step": 2881 }, { "epoch": 0.82178500142572, "grad_norm": 0.9206006167023392, "learning_rate": 8.103824006064032e-07, "loss": 0.0267, "step": 2882 }, { "epoch": 0.8220701454234388, "grad_norm": 1.184257876582493, "learning_rate": 8.078633966975818e-07, "loss": 0.0344, "step": 2883 }, { "epoch": 0.8223552894211577, "grad_norm": 1.119928137630302, "learning_rate": 8.053479698129463e-07, "loss": 0.021, "step": 2884 }, { "epoch": 0.8226404334188765, "grad_norm": 0.7752796412552893, "learning_rate": 8.028361220988334e-07, "loss": 0.0217, "step": 2885 }, { "epoch": 0.8229255774165953, "grad_norm": 1.0723505832855476, "learning_rate": 8.003278556985295e-07, "loss": 0.0404, "step": 2886 }, { "epoch": 0.8232107214143143, "grad_norm": 0.370869296241954, "learning_rate": 7.978231727522634e-07, "loss": 0.0091, "step": 2887 }, { "epoch": 0.8234958654120331, "grad_norm": 1.6501205644493508, "learning_rate": 7.953220753972029e-07, "loss": 0.0302, "step": 2888 }, { "epoch": 0.823781009409752, "grad_norm": 3.387858235076237, "learning_rate": 7.928245657674599e-07, "loss": 0.0449, "step": 2889 }, { "epoch": 0.8240661534074708, "grad_norm": 0.7939624399299788, "learning_rate": 7.903306459940863e-07, "loss": 0.0149, "step": 2890 }, { "epoch": 0.8243512974051896, "grad_norm": 0.8919761379481936, "learning_rate": 7.87840318205067e-07, "loss": 0.0121, "step": 2891 }, { "epoch": 0.8246364414029085, "grad_norm": 0.8945707327162529, "learning_rate": 7.853535845253252e-07, "loss": 0.0167, "step": 2892 }, { "epoch": 0.8249215854006273, "grad_norm": 1.4960085975377995, "learning_rate": 7.828704470767151e-07, "loss": 0.0239, "step": 2893 }, { "epoch": 0.8252067293983462, "grad_norm": 1.7361926971910235, "learning_rate": 7.803909079780237e-07, "loss": 0.0196, "step": 2894 }, { "epoch": 0.825491873396065, "grad_norm": 2.1069882551766312, "learning_rate": 7.779149693449666e-07, "loss": 0.0331, "step": 2895 }, { "epoch": 0.8257770173937838, "grad_norm": 1.640025015859863, "learning_rate": 7.754426332901888e-07, "loss": 0.0393, "step": 2896 }, { "epoch": 0.8260621613915027, "grad_norm": 0.6030323002263968, "learning_rate": 7.729739019232579e-07, "loss": 0.0108, "step": 2897 }, { "epoch": 0.8263473053892215, "grad_norm": 0.526741493347755, "learning_rate": 7.705087773506731e-07, "loss": 0.016, "step": 2898 }, { "epoch": 0.8266324493869404, "grad_norm": 1.5742017652828122, "learning_rate": 7.680472616758467e-07, "loss": 0.0298, "step": 2899 }, { "epoch": 0.8269175933846592, "grad_norm": 0.934331197910886, "learning_rate": 7.655893569991175e-07, "loss": 0.0105, "step": 2900 }, { "epoch": 0.8272027373823782, "grad_norm": 0.7000170882506863, "learning_rate": 7.631350654177405e-07, "loss": 0.0158, "step": 2901 }, { "epoch": 0.827487881380097, "grad_norm": 0.9680259614112673, "learning_rate": 7.606843890258914e-07, "loss": 0.0239, "step": 2902 }, { "epoch": 0.8277730253778158, "grad_norm": 1.494423198330321, "learning_rate": 7.582373299146578e-07, "loss": 0.0392, "step": 2903 }, { "epoch": 0.8280581693755347, "grad_norm": 1.207768362788276, "learning_rate": 7.557938901720418e-07, "loss": 0.0151, "step": 2904 }, { "epoch": 0.8283433133732535, "grad_norm": 1.1032689271047176, "learning_rate": 7.533540718829547e-07, "loss": 0.0482, "step": 2905 }, { "epoch": 0.8286284573709723, "grad_norm": 1.376001495335339, "learning_rate": 7.509178771292231e-07, "loss": 0.0188, "step": 2906 }, { "epoch": 0.8289136013686912, "grad_norm": 1.0040653742355359, "learning_rate": 7.484853079895782e-07, "loss": 0.014, "step": 2907 }, { "epoch": 0.82919874536641, "grad_norm": 0.853257302586088, "learning_rate": 7.460563665396569e-07, "loss": 0.019, "step": 2908 }, { "epoch": 0.8294838893641289, "grad_norm": 0.41199696502392325, "learning_rate": 7.436310548520037e-07, "loss": 0.0078, "step": 2909 }, { "epoch": 0.8297690333618477, "grad_norm": 1.344739178123292, "learning_rate": 7.412093749960625e-07, "loss": 0.0381, "step": 2910 }, { "epoch": 0.8300541773595665, "grad_norm": 1.4175396586193671, "learning_rate": 7.38791329038181e-07, "loss": 0.0152, "step": 2911 }, { "epoch": 0.8303393213572854, "grad_norm": 1.8019194230806457, "learning_rate": 7.363769190416048e-07, "loss": 0.0206, "step": 2912 }, { "epoch": 0.8306244653550042, "grad_norm": 0.9576926416107516, "learning_rate": 7.339661470664761e-07, "loss": 0.0179, "step": 2913 }, { "epoch": 0.8309096093527232, "grad_norm": 1.1324523780149793, "learning_rate": 7.315590151698371e-07, "loss": 0.0238, "step": 2914 }, { "epoch": 0.831194753350442, "grad_norm": 1.4722177708236213, "learning_rate": 7.291555254056198e-07, "loss": 0.0186, "step": 2915 }, { "epoch": 0.8314798973481609, "grad_norm": 0.9504375580728752, "learning_rate": 7.267556798246489e-07, "loss": 0.0433, "step": 2916 }, { "epoch": 0.8317650413458797, "grad_norm": 1.3361157091698659, "learning_rate": 7.243594804746401e-07, "loss": 0.0271, "step": 2917 }, { "epoch": 0.8320501853435985, "grad_norm": 1.5034432617659543, "learning_rate": 7.219669294002002e-07, "loss": 0.0161, "step": 2918 }, { "epoch": 0.8323353293413174, "grad_norm": 0.8175025379771647, "learning_rate": 7.195780286428206e-07, "loss": 0.0126, "step": 2919 }, { "epoch": 0.8326204733390362, "grad_norm": 0.8373585191851456, "learning_rate": 7.17192780240879e-07, "loss": 0.0164, "step": 2920 }, { "epoch": 0.832905617336755, "grad_norm": 0.4003185300249988, "learning_rate": 7.148111862296331e-07, "loss": 0.0117, "step": 2921 }, { "epoch": 0.8331907613344739, "grad_norm": 1.5025437383078868, "learning_rate": 7.124332486412289e-07, "loss": 0.0204, "step": 2922 }, { "epoch": 0.8334759053321927, "grad_norm": 1.3047499112241643, "learning_rate": 7.100589695046883e-07, "loss": 0.022, "step": 2923 }, { "epoch": 0.8337610493299116, "grad_norm": 0.568459371870428, "learning_rate": 7.076883508459115e-07, "loss": 0.0078, "step": 2924 }, { "epoch": 0.8340461933276304, "grad_norm": 2.1528288915735145, "learning_rate": 7.053213946876769e-07, "loss": 0.0592, "step": 2925 }, { "epoch": 0.8343313373253493, "grad_norm": 0.7861846035271016, "learning_rate": 7.029581030496368e-07, "loss": 0.025, "step": 2926 }, { "epoch": 0.8346164813230682, "grad_norm": 2.3660338087684987, "learning_rate": 7.005984779483166e-07, "loss": 0.0348, "step": 2927 }, { "epoch": 0.834901625320787, "grad_norm": 1.2003912939811823, "learning_rate": 6.982425213971145e-07, "loss": 0.0263, "step": 2928 }, { "epoch": 0.8351867693185059, "grad_norm": 0.80495647903881, "learning_rate": 6.958902354062952e-07, "loss": 0.0153, "step": 2929 }, { "epoch": 0.8354719133162247, "grad_norm": 0.9565190013742579, "learning_rate": 6.93541621982996e-07, "loss": 0.0163, "step": 2930 }, { "epoch": 0.8357570573139436, "grad_norm": 1.5292130072943486, "learning_rate": 6.911966831312189e-07, "loss": 0.0256, "step": 2931 }, { "epoch": 0.8360422013116624, "grad_norm": 1.3532017392811009, "learning_rate": 6.888554208518272e-07, "loss": 0.019, "step": 2932 }, { "epoch": 0.8363273453093812, "grad_norm": 0.9094015042109889, "learning_rate": 6.86517837142549e-07, "loss": 0.015, "step": 2933 }, { "epoch": 0.8366124893071001, "grad_norm": 0.7082302676502451, "learning_rate": 6.841839339979778e-07, "loss": 0.0187, "step": 2934 }, { "epoch": 0.8368976333048189, "grad_norm": 0.9879617552325342, "learning_rate": 6.818537134095604e-07, "loss": 0.0201, "step": 2935 }, { "epoch": 0.8371827773025378, "grad_norm": 0.564486638835549, "learning_rate": 6.795271773656054e-07, "loss": 0.0208, "step": 2936 }, { "epoch": 0.8374679213002566, "grad_norm": 0.2814308779842693, "learning_rate": 6.772043278512747e-07, "loss": 0.0068, "step": 2937 }, { "epoch": 0.8377530652979754, "grad_norm": 0.7178043054217914, "learning_rate": 6.748851668485873e-07, "loss": 0.0098, "step": 2938 }, { "epoch": 0.8380382092956943, "grad_norm": 0.6805116484205672, "learning_rate": 6.725696963364126e-07, "loss": 0.0081, "step": 2939 }, { "epoch": 0.8383233532934131, "grad_norm": 1.0022569621899031, "learning_rate": 6.702579182904723e-07, "loss": 0.0264, "step": 2940 }, { "epoch": 0.8386084972911321, "grad_norm": 1.904193877543303, "learning_rate": 6.679498346833374e-07, "loss": 0.042, "step": 2941 }, { "epoch": 0.8388936412888509, "grad_norm": 0.29653976115050246, "learning_rate": 6.656454474844248e-07, "loss": 0.0059, "step": 2942 }, { "epoch": 0.8391787852865698, "grad_norm": 0.6705392516340656, "learning_rate": 6.633447586600028e-07, "loss": 0.0118, "step": 2943 }, { "epoch": 0.8394639292842886, "grad_norm": 0.4684828232836064, "learning_rate": 6.61047770173176e-07, "loss": 0.0083, "step": 2944 }, { "epoch": 0.8397490732820074, "grad_norm": 0.6894065679425363, "learning_rate": 6.587544839838961e-07, "loss": 0.0139, "step": 2945 }, { "epoch": 0.8400342172797263, "grad_norm": 1.1770716615384926, "learning_rate": 6.564649020489566e-07, "loss": 0.0188, "step": 2946 }, { "epoch": 0.8403193612774451, "grad_norm": 0.8594451557210869, "learning_rate": 6.541790263219894e-07, "loss": 0.0171, "step": 2947 }, { "epoch": 0.840604505275164, "grad_norm": 0.9031875486057582, "learning_rate": 6.518968587534635e-07, "loss": 0.025, "step": 2948 }, { "epoch": 0.8408896492728828, "grad_norm": 1.9432753386854618, "learning_rate": 6.496184012906814e-07, "loss": 0.0232, "step": 2949 }, { "epoch": 0.8411747932706016, "grad_norm": 0.7020884836813761, "learning_rate": 6.473436558777846e-07, "loss": 0.0086, "step": 2950 }, { "epoch": 0.8414599372683205, "grad_norm": 0.5466929368926949, "learning_rate": 6.450726244557448e-07, "loss": 0.012, "step": 2951 }, { "epoch": 0.8417450812660393, "grad_norm": 1.7507269669821028, "learning_rate": 6.428053089623648e-07, "loss": 0.0178, "step": 2952 }, { "epoch": 0.8420302252637581, "grad_norm": 1.4983504947040904, "learning_rate": 6.405417113322765e-07, "loss": 0.0259, "step": 2953 }, { "epoch": 0.8423153692614771, "grad_norm": 0.6649231360890827, "learning_rate": 6.382818334969399e-07, "loss": 0.0152, "step": 2954 }, { "epoch": 0.8426005132591959, "grad_norm": 0.5291504974490259, "learning_rate": 6.360256773846402e-07, "loss": 0.0093, "step": 2955 }, { "epoch": 0.8428856572569148, "grad_norm": 1.0922105163879545, "learning_rate": 6.337732449204886e-07, "loss": 0.0149, "step": 2956 }, { "epoch": 0.8431708012546336, "grad_norm": 1.0162775087631601, "learning_rate": 6.315245380264179e-07, "loss": 0.0238, "step": 2957 }, { "epoch": 0.8434559452523525, "grad_norm": 1.035067751129879, "learning_rate": 6.292795586211803e-07, "loss": 0.0184, "step": 2958 }, { "epoch": 0.8437410892500713, "grad_norm": 0.4314581076678392, "learning_rate": 6.27038308620353e-07, "loss": 0.0069, "step": 2959 }, { "epoch": 0.8440262332477901, "grad_norm": 0.9172564461045714, "learning_rate": 6.24800789936324e-07, "loss": 0.013, "step": 2960 }, { "epoch": 0.844311377245509, "grad_norm": 1.8064356661478254, "learning_rate": 6.225670044783011e-07, "loss": 0.0285, "step": 2961 }, { "epoch": 0.8445965212432278, "grad_norm": 0.9846608581771469, "learning_rate": 6.203369541523075e-07, "loss": 0.0181, "step": 2962 }, { "epoch": 0.8448816652409467, "grad_norm": 0.5341057881948537, "learning_rate": 6.181106408611781e-07, "loss": 0.0092, "step": 2963 }, { "epoch": 0.8451668092386655, "grad_norm": 0.5404376217996557, "learning_rate": 6.158880665045586e-07, "loss": 0.0104, "step": 2964 }, { "epoch": 0.8454519532363843, "grad_norm": 0.5547122276300936, "learning_rate": 6.136692329789046e-07, "loss": 0.0138, "step": 2965 }, { "epoch": 0.8457370972341032, "grad_norm": 0.870237694120721, "learning_rate": 6.114541421774811e-07, "loss": 0.0194, "step": 2966 }, { "epoch": 0.8460222412318221, "grad_norm": 1.0347448612702808, "learning_rate": 6.092427959903574e-07, "loss": 0.0283, "step": 2967 }, { "epoch": 0.846307385229541, "grad_norm": 1.4250265517501355, "learning_rate": 6.070351963044091e-07, "loss": 0.0508, "step": 2968 }, { "epoch": 0.8465925292272598, "grad_norm": 0.34595863279716216, "learning_rate": 6.04831345003315e-07, "loss": 0.0062, "step": 2969 }, { "epoch": 0.8468776732249786, "grad_norm": 1.2934086285223172, "learning_rate": 6.026312439675553e-07, "loss": 0.0239, "step": 2970 }, { "epoch": 0.8471628172226975, "grad_norm": 1.1738506805733755, "learning_rate": 6.004348950744094e-07, "loss": 0.0134, "step": 2971 }, { "epoch": 0.8474479612204163, "grad_norm": 0.9407356078115766, "learning_rate": 5.982423001979559e-07, "loss": 0.0395, "step": 2972 }, { "epoch": 0.8477331052181352, "grad_norm": 1.899663507801717, "learning_rate": 5.960534612090707e-07, "loss": 0.029, "step": 2973 }, { "epoch": 0.848018249215854, "grad_norm": 1.635086087665949, "learning_rate": 5.93868379975423e-07, "loss": 0.0204, "step": 2974 }, { "epoch": 0.8483033932135728, "grad_norm": 2.7131462813485427, "learning_rate": 5.916870583614792e-07, "loss": 0.0511, "step": 2975 }, { "epoch": 0.8485885372112917, "grad_norm": 0.6000044831511491, "learning_rate": 5.895094982284949e-07, "loss": 0.0151, "step": 2976 }, { "epoch": 0.8488736812090105, "grad_norm": 0.8428158736884612, "learning_rate": 5.873357014345143e-07, "loss": 0.0275, "step": 2977 }, { "epoch": 0.8491588252067294, "grad_norm": 1.1200799726027786, "learning_rate": 5.851656698343761e-07, "loss": 0.0311, "step": 2978 }, { "epoch": 0.8494439692044482, "grad_norm": 0.7513879350557346, "learning_rate": 5.829994052797011e-07, "loss": 0.0189, "step": 2979 }, { "epoch": 0.849729113202167, "grad_norm": 0.5097265708096947, "learning_rate": 5.808369096188981e-07, "loss": 0.0101, "step": 2980 }, { "epoch": 0.850014257199886, "grad_norm": 1.5460511983229916, "learning_rate": 5.786781846971601e-07, "loss": 0.0304, "step": 2981 }, { "epoch": 0.8502994011976048, "grad_norm": 1.677115356095477, "learning_rate": 5.765232323564617e-07, "loss": 0.0635, "step": 2982 }, { "epoch": 0.8505845451953237, "grad_norm": 0.880229931432272, "learning_rate": 5.743720544355597e-07, "loss": 0.0363, "step": 2983 }, { "epoch": 0.8508696891930425, "grad_norm": 0.5325984514944866, "learning_rate": 5.722246527699887e-07, "loss": 0.0049, "step": 2984 }, { "epoch": 0.8511548331907614, "grad_norm": 1.031023684922083, "learning_rate": 5.700810291920628e-07, "loss": 0.0192, "step": 2985 }, { "epoch": 0.8514399771884802, "grad_norm": 0.8501593340127255, "learning_rate": 5.679411855308697e-07, "loss": 0.0173, "step": 2986 }, { "epoch": 0.851725121186199, "grad_norm": 1.2342740400840233, "learning_rate": 5.658051236122774e-07, "loss": 0.023, "step": 2987 }, { "epoch": 0.8520102651839179, "grad_norm": 1.060959958106904, "learning_rate": 5.636728452589196e-07, "loss": 0.0154, "step": 2988 }, { "epoch": 0.8522954091816367, "grad_norm": 1.2602365805850364, "learning_rate": 5.615443522902076e-07, "loss": 0.0231, "step": 2989 }, { "epoch": 0.8525805531793555, "grad_norm": 0.764129932845986, "learning_rate": 5.594196465223184e-07, "loss": 0.0094, "step": 2990 }, { "epoch": 0.8528656971770744, "grad_norm": 1.5099273606359904, "learning_rate": 5.57298729768202e-07, "loss": 0.0173, "step": 2991 }, { "epoch": 0.8531508411747932, "grad_norm": 0.9820827550126608, "learning_rate": 5.551816038375729e-07, "loss": 0.0134, "step": 2992 }, { "epoch": 0.8534359851725121, "grad_norm": 1.1727880595649633, "learning_rate": 5.530682705369084e-07, "loss": 0.0258, "step": 2993 }, { "epoch": 0.853721129170231, "grad_norm": 1.0065318180810632, "learning_rate": 5.509587316694537e-07, "loss": 0.0207, "step": 2994 }, { "epoch": 0.8540062731679499, "grad_norm": 0.7848127018539507, "learning_rate": 5.488529890352157e-07, "loss": 0.0083, "step": 2995 }, { "epoch": 0.8542914171656687, "grad_norm": 1.0156083213820275, "learning_rate": 5.467510444309609e-07, "loss": 0.0119, "step": 2996 }, { "epoch": 0.8545765611633875, "grad_norm": 1.0197812302503872, "learning_rate": 5.446528996502149e-07, "loss": 0.0128, "step": 2997 }, { "epoch": 0.8548617051611064, "grad_norm": 0.8285221051674888, "learning_rate": 5.425585564832625e-07, "loss": 0.0093, "step": 2998 }, { "epoch": 0.8551468491588252, "grad_norm": 0.8350102395268476, "learning_rate": 5.404680167171427e-07, "loss": 0.0209, "step": 2999 }, { "epoch": 0.8554319931565441, "grad_norm": 0.8296267711722471, "learning_rate": 5.38381282135651e-07, "loss": 0.0103, "step": 3000 }, { "epoch": 0.8557171371542629, "grad_norm": 1.1354258345442707, "learning_rate": 5.362983545193351e-07, "loss": 0.0132, "step": 3001 }, { "epoch": 0.8560022811519817, "grad_norm": 1.3807885834714135, "learning_rate": 5.34219235645494e-07, "loss": 0.0184, "step": 3002 }, { "epoch": 0.8562874251497006, "grad_norm": 0.4618296825629643, "learning_rate": 5.321439272881795e-07, "loss": 0.0101, "step": 3003 }, { "epoch": 0.8565725691474194, "grad_norm": 0.7977559295975786, "learning_rate": 5.300724312181876e-07, "loss": 0.0149, "step": 3004 }, { "epoch": 0.8568577131451383, "grad_norm": 0.5816427329433224, "learning_rate": 5.280047492030638e-07, "loss": 0.0177, "step": 3005 }, { "epoch": 0.8571428571428571, "grad_norm": 0.47258730273410254, "learning_rate": 5.259408830070989e-07, "loss": 0.0071, "step": 3006 }, { "epoch": 0.857428001140576, "grad_norm": 0.7070970759261074, "learning_rate": 5.238808343913299e-07, "loss": 0.0247, "step": 3007 }, { "epoch": 0.8577131451382949, "grad_norm": 0.6869352056980774, "learning_rate": 5.218246051135323e-07, "loss": 0.0137, "step": 3008 }, { "epoch": 0.8579982891360137, "grad_norm": 1.0038883944832988, "learning_rate": 5.197721969282271e-07, "loss": 0.0257, "step": 3009 }, { "epoch": 0.8582834331337326, "grad_norm": 1.2068882204622162, "learning_rate": 5.177236115866685e-07, "loss": 0.0209, "step": 3010 }, { "epoch": 0.8585685771314514, "grad_norm": 0.9311944031546174, "learning_rate": 5.156788508368565e-07, "loss": 0.0231, "step": 3011 }, { "epoch": 0.8588537211291702, "grad_norm": 1.5397233224377365, "learning_rate": 5.136379164235222e-07, "loss": 0.0241, "step": 3012 }, { "epoch": 0.8591388651268891, "grad_norm": 1.6349055154960528, "learning_rate": 5.116008100881348e-07, "loss": 0.0272, "step": 3013 }, { "epoch": 0.8594240091246079, "grad_norm": 1.4003109679024432, "learning_rate": 5.095675335688949e-07, "loss": 0.0391, "step": 3014 }, { "epoch": 0.8597091531223268, "grad_norm": 0.6121106764703516, "learning_rate": 5.075380886007369e-07, "loss": 0.0158, "step": 3015 }, { "epoch": 0.8599942971200456, "grad_norm": 1.343535205043131, "learning_rate": 5.055124769153247e-07, "loss": 0.0401, "step": 3016 }, { "epoch": 0.8602794411177644, "grad_norm": 0.3419700286934388, "learning_rate": 5.034907002410517e-07, "loss": 0.0048, "step": 3017 }, { "epoch": 0.8605645851154833, "grad_norm": 0.9895620726753768, "learning_rate": 5.014727603030389e-07, "loss": 0.0221, "step": 3018 }, { "epoch": 0.8608497291132021, "grad_norm": 1.3614488033863745, "learning_rate": 4.994586588231354e-07, "loss": 0.0183, "step": 3019 }, { "epoch": 0.861134873110921, "grad_norm": 1.7974814733359572, "learning_rate": 4.974483975199135e-07, "loss": 0.053, "step": 3020 }, { "epoch": 0.8614200171086399, "grad_norm": 0.6222206952249488, "learning_rate": 4.954419781086672e-07, "loss": 0.0126, "step": 3021 }, { "epoch": 0.8617051611063588, "grad_norm": 0.5890050784706143, "learning_rate": 4.934394023014133e-07, "loss": 0.0097, "step": 3022 }, { "epoch": 0.8619903051040776, "grad_norm": 0.2498934201504258, "learning_rate": 4.914406718068914e-07, "loss": 0.0037, "step": 3023 }, { "epoch": 0.8622754491017964, "grad_norm": 1.370346135075879, "learning_rate": 4.894457883305576e-07, "loss": 0.0189, "step": 3024 }, { "epoch": 0.8625605930995153, "grad_norm": 0.35639843396867693, "learning_rate": 4.874547535745872e-07, "loss": 0.0055, "step": 3025 }, { "epoch": 0.8628457370972341, "grad_norm": 1.6416261884833374, "learning_rate": 4.854675692378669e-07, "loss": 0.0296, "step": 3026 }, { "epoch": 0.863130881094953, "grad_norm": 1.1136701347202882, "learning_rate": 4.834842370160048e-07, "loss": 0.0164, "step": 3027 }, { "epoch": 0.8634160250926718, "grad_norm": 1.368275106996081, "learning_rate": 4.815047586013172e-07, "loss": 0.0336, "step": 3028 }, { "epoch": 0.8637011690903906, "grad_norm": 1.2575862109268776, "learning_rate": 4.795291356828335e-07, "loss": 0.0228, "step": 3029 }, { "epoch": 0.8639863130881095, "grad_norm": 0.6138345409799086, "learning_rate": 4.775573699462926e-07, "loss": 0.0098, "step": 3030 }, { "epoch": 0.8642714570858283, "grad_norm": 2.4032353134017397, "learning_rate": 4.7558946307414645e-07, "loss": 0.0509, "step": 3031 }, { "epoch": 0.8645566010835471, "grad_norm": 1.3870212191516793, "learning_rate": 4.736254167455473e-07, "loss": 0.0358, "step": 3032 }, { "epoch": 0.864841745081266, "grad_norm": 1.4321688335485183, "learning_rate": 4.7166523263635743e-07, "loss": 0.0353, "step": 3033 }, { "epoch": 0.8651268890789849, "grad_norm": 0.452148177107619, "learning_rate": 4.697089124191434e-07, "loss": 0.0071, "step": 3034 }, { "epoch": 0.8654120330767038, "grad_norm": 2.022511254722525, "learning_rate": 4.677564577631749e-07, "loss": 0.0269, "step": 3035 }, { "epoch": 0.8656971770744226, "grad_norm": 1.1264957765805845, "learning_rate": 4.6580787033442376e-07, "loss": 0.0252, "step": 3036 }, { "epoch": 0.8659823210721415, "grad_norm": 0.9989388112218847, "learning_rate": 4.6386315179555883e-07, "loss": 0.0241, "step": 3037 }, { "epoch": 0.8662674650698603, "grad_norm": 1.9978868244980388, "learning_rate": 4.6192230380595004e-07, "loss": 0.0216, "step": 3038 }, { "epoch": 0.8665526090675791, "grad_norm": 0.4630351383686262, "learning_rate": 4.599853280216665e-07, "loss": 0.0103, "step": 3039 }, { "epoch": 0.866837753065298, "grad_norm": 0.7910478687505361, "learning_rate": 4.580522260954706e-07, "loss": 0.0133, "step": 3040 }, { "epoch": 0.8671228970630168, "grad_norm": 0.905830864917694, "learning_rate": 4.561229996768196e-07, "loss": 0.0294, "step": 3041 }, { "epoch": 0.8674080410607357, "grad_norm": 0.8213970821241439, "learning_rate": 4.5419765041186556e-07, "loss": 0.0219, "step": 3042 }, { "epoch": 0.8676931850584545, "grad_norm": 1.1748421184539621, "learning_rate": 4.5227617994345053e-07, "loss": 0.0163, "step": 3043 }, { "epoch": 0.8679783290561733, "grad_norm": 0.43672133470671437, "learning_rate": 4.503585899111068e-07, "loss": 0.007, "step": 3044 }, { "epoch": 0.8682634730538922, "grad_norm": 2.0611265738885183, "learning_rate": 4.4844488195105784e-07, "loss": 0.048, "step": 3045 }, { "epoch": 0.868548617051611, "grad_norm": 1.3462808320153008, "learning_rate": 4.4653505769621073e-07, "loss": 0.0322, "step": 3046 }, { "epoch": 0.86883376104933, "grad_norm": 1.1019872968537088, "learning_rate": 4.446291187761648e-07, "loss": 0.0257, "step": 3047 }, { "epoch": 0.8691189050470488, "grad_norm": 0.4768401486598245, "learning_rate": 4.4272706681719737e-07, "loss": 0.0138, "step": 3048 }, { "epoch": 0.8694040490447676, "grad_norm": 1.2680969131683553, "learning_rate": 4.408289034422736e-07, "loss": 0.035, "step": 3049 }, { "epoch": 0.8696891930424865, "grad_norm": 1.0552008198715335, "learning_rate": 4.3893463027103735e-07, "loss": 0.0191, "step": 3050 }, { "epoch": 0.8699743370402053, "grad_norm": 0.7865993113772903, "learning_rate": 4.370442489198179e-07, "loss": 0.017, "step": 3051 }, { "epoch": 0.8702594810379242, "grad_norm": 0.7490127620354204, "learning_rate": 4.351577610016189e-07, "loss": 0.0132, "step": 3052 }, { "epoch": 0.870544625035643, "grad_norm": 0.5028891048262714, "learning_rate": 4.3327516812612545e-07, "loss": 0.0068, "step": 3053 }, { "epoch": 0.8708297690333618, "grad_norm": 0.7876344772007207, "learning_rate": 4.313964718996949e-07, "loss": 0.0195, "step": 3054 }, { "epoch": 0.8711149130310807, "grad_norm": 0.46302658918296347, "learning_rate": 4.2952167392536436e-07, "loss": 0.0081, "step": 3055 }, { "epoch": 0.8714000570287995, "grad_norm": 0.5289145118332526, "learning_rate": 4.2765077580284197e-07, "loss": 0.0065, "step": 3056 }, { "epoch": 0.8716852010265184, "grad_norm": 1.5234422334108315, "learning_rate": 4.257837791285091e-07, "loss": 0.016, "step": 3057 }, { "epoch": 0.8719703450242372, "grad_norm": 1.6244487998081059, "learning_rate": 4.2392068549541755e-07, "loss": 0.0346, "step": 3058 }, { "epoch": 0.872255489021956, "grad_norm": 1.141957719991174, "learning_rate": 4.22061496493289e-07, "loss": 0.0178, "step": 3059 }, { "epoch": 0.8725406330196749, "grad_norm": 1.6278771393473568, "learning_rate": 4.202062137085139e-07, "loss": 0.0276, "step": 3060 }, { "epoch": 0.8728257770173938, "grad_norm": 0.514051654602654, "learning_rate": 4.183548387241498e-07, "loss": 0.0086, "step": 3061 }, { "epoch": 0.8731109210151127, "grad_norm": 0.8895694324937317, "learning_rate": 4.1650737311991765e-07, "loss": 0.0163, "step": 3062 }, { "epoch": 0.8733960650128315, "grad_norm": 1.7832475192370916, "learning_rate": 4.146638184722057e-07, "loss": 0.0443, "step": 3063 }, { "epoch": 0.8736812090105504, "grad_norm": 1.1145459448952413, "learning_rate": 4.1282417635406525e-07, "loss": 0.0241, "step": 3064 }, { "epoch": 0.8739663530082692, "grad_norm": 1.3530250524727738, "learning_rate": 4.109884483352045e-07, "loss": 0.0204, "step": 3065 }, { "epoch": 0.874251497005988, "grad_norm": 1.0947124737811844, "learning_rate": 4.091566359819954e-07, "loss": 0.0418, "step": 3066 }, { "epoch": 0.8745366410037069, "grad_norm": 0.9920153082074292, "learning_rate": 4.0732874085747053e-07, "loss": 0.0183, "step": 3067 }, { "epoch": 0.8748217850014257, "grad_norm": 1.065157293408351, "learning_rate": 4.055047645213167e-07, "loss": 0.0162, "step": 3068 }, { "epoch": 0.8751069289991446, "grad_norm": 0.8965691490104757, "learning_rate": 4.0368470852987893e-07, "loss": 0.024, "step": 3069 }, { "epoch": 0.8753920729968634, "grad_norm": 2.346622705630544, "learning_rate": 4.018685744361539e-07, "loss": 0.0497, "step": 3070 }, { "epoch": 0.8756772169945822, "grad_norm": 0.9532499289760871, "learning_rate": 4.000563637897964e-07, "loss": 0.0121, "step": 3071 }, { "epoch": 0.8759623609923011, "grad_norm": 0.4104488966130429, "learning_rate": 3.982480781371106e-07, "loss": 0.0082, "step": 3072 }, { "epoch": 0.8762475049900199, "grad_norm": 1.5437624866770545, "learning_rate": 3.9644371902105296e-07, "loss": 0.0588, "step": 3073 }, { "epoch": 0.8765326489877389, "grad_norm": 1.0598825987060185, "learning_rate": 3.9464328798122843e-07, "loss": 0.011, "step": 3074 }, { "epoch": 0.8768177929854577, "grad_norm": 0.7745631171488191, "learning_rate": 3.928467865538904e-07, "loss": 0.0277, "step": 3075 }, { "epoch": 0.8771029369831765, "grad_norm": 1.1461659365443477, "learning_rate": 3.910542162719394e-07, "loss": 0.0379, "step": 3076 }, { "epoch": 0.8773880809808954, "grad_norm": 1.0689248460984624, "learning_rate": 3.8926557866492297e-07, "loss": 0.0189, "step": 3077 }, { "epoch": 0.8776732249786142, "grad_norm": 1.7797425568134084, "learning_rate": 3.874808752590298e-07, "loss": 0.0512, "step": 3078 }, { "epoch": 0.8779583689763331, "grad_norm": 1.313282350888063, "learning_rate": 3.8570010757709555e-07, "loss": 0.0285, "step": 3079 }, { "epoch": 0.8782435129740519, "grad_norm": 1.3537466884000247, "learning_rate": 3.8392327713859644e-07, "loss": 0.0192, "step": 3080 }, { "epoch": 0.8785286569717707, "grad_norm": 1.1739115495562644, "learning_rate": 3.821503854596459e-07, "loss": 0.0154, "step": 3081 }, { "epoch": 0.8788138009694896, "grad_norm": 0.6978895204370438, "learning_rate": 3.803814340529999e-07, "loss": 0.0205, "step": 3082 }, { "epoch": 0.8790989449672084, "grad_norm": 2.472278902571292, "learning_rate": 3.786164244280532e-07, "loss": 0.0816, "step": 3083 }, { "epoch": 0.8793840889649273, "grad_norm": 0.42124346849130617, "learning_rate": 3.7685535809083406e-07, "loss": 0.0061, "step": 3084 }, { "epoch": 0.8796692329626461, "grad_norm": 0.641144962594694, "learning_rate": 3.7509823654400757e-07, "loss": 0.0065, "step": 3085 }, { "epoch": 0.8799543769603649, "grad_norm": 0.9745992756194799, "learning_rate": 3.7334506128687277e-07, "loss": 0.0127, "step": 3086 }, { "epoch": 0.8802395209580839, "grad_norm": 1.0045168845028045, "learning_rate": 3.715958338153619e-07, "loss": 0.0247, "step": 3087 }, { "epoch": 0.8805246649558027, "grad_norm": 0.49384156816707714, "learning_rate": 3.698505556220372e-07, "loss": 0.0134, "step": 3088 }, { "epoch": 0.8808098089535216, "grad_norm": 1.153521953839027, "learning_rate": 3.681092281960935e-07, "loss": 0.0098, "step": 3089 }, { "epoch": 0.8810949529512404, "grad_norm": 1.7322788165810779, "learning_rate": 3.6637185302335234e-07, "loss": 0.035, "step": 3090 }, { "epoch": 0.8813800969489592, "grad_norm": 1.129612121303509, "learning_rate": 3.646384315862633e-07, "loss": 0.0126, "step": 3091 }, { "epoch": 0.8816652409466781, "grad_norm": 0.602950178845579, "learning_rate": 3.629089653639034e-07, "loss": 0.0165, "step": 3092 }, { "epoch": 0.8819503849443969, "grad_norm": 0.6273478691828218, "learning_rate": 3.6118345583197434e-07, "loss": 0.0087, "step": 3093 }, { "epoch": 0.8822355289421158, "grad_norm": 0.7988486703644966, "learning_rate": 3.594619044628017e-07, "loss": 0.0235, "step": 3094 }, { "epoch": 0.8825206729398346, "grad_norm": 1.1404611832652243, "learning_rate": 3.577443127253316e-07, "loss": 0.0208, "step": 3095 }, { "epoch": 0.8828058169375534, "grad_norm": 0.673777489436252, "learning_rate": 3.5603068208513616e-07, "loss": 0.0139, "step": 3096 }, { "epoch": 0.8830909609352723, "grad_norm": 1.0996498578604454, "learning_rate": 3.5432101400440456e-07, "loss": 0.0188, "step": 3097 }, { "epoch": 0.8833761049329911, "grad_norm": 1.7534824016132662, "learning_rate": 3.526153099419427e-07, "loss": 0.0332, "step": 3098 }, { "epoch": 0.88366124893071, "grad_norm": 1.5098939443616157, "learning_rate": 3.5091357135317917e-07, "loss": 0.0269, "step": 3099 }, { "epoch": 0.8839463929284288, "grad_norm": 0.8049820887786194, "learning_rate": 3.492157996901552e-07, "loss": 0.0082, "step": 3100 }, { "epoch": 0.8842315369261478, "grad_norm": 0.9473485933582722, "learning_rate": 3.4752199640152916e-07, "loss": 0.0215, "step": 3101 }, { "epoch": 0.8845166809238666, "grad_norm": 1.2883061493145709, "learning_rate": 3.458321629325717e-07, "loss": 0.0306, "step": 3102 }, { "epoch": 0.8848018249215854, "grad_norm": 0.9187716869904959, "learning_rate": 3.441463007251672e-07, "loss": 0.0132, "step": 3103 }, { "epoch": 0.8850869689193043, "grad_norm": 0.6349246018941566, "learning_rate": 3.4246441121781105e-07, "loss": 0.0059, "step": 3104 }, { "epoch": 0.8853721129170231, "grad_norm": 1.3681391815563542, "learning_rate": 3.407864958456092e-07, "loss": 0.016, "step": 3105 }, { "epoch": 0.885657256914742, "grad_norm": 1.3040792173489308, "learning_rate": 3.391125560402764e-07, "loss": 0.0424, "step": 3106 }, { "epoch": 0.8859424009124608, "grad_norm": 1.2333440427099385, "learning_rate": 3.3744259323013453e-07, "loss": 0.0154, "step": 3107 }, { "epoch": 0.8862275449101796, "grad_norm": 1.150041429446506, "learning_rate": 3.357766088401149e-07, "loss": 0.0355, "step": 3108 }, { "epoch": 0.8865126889078985, "grad_norm": 0.5551063042416896, "learning_rate": 3.3411460429174927e-07, "loss": 0.0093, "step": 3109 }, { "epoch": 0.8867978329056173, "grad_norm": 1.2252348783292581, "learning_rate": 3.324565810031777e-07, "loss": 0.0247, "step": 3110 }, { "epoch": 0.8870829769033362, "grad_norm": 0.7969957705002801, "learning_rate": 3.3080254038914014e-07, "loss": 0.0128, "step": 3111 }, { "epoch": 0.887368120901055, "grad_norm": 0.5694148413447736, "learning_rate": 3.2915248386098163e-07, "loss": 0.0049, "step": 3112 }, { "epoch": 0.8876532648987738, "grad_norm": 0.5807428864056374, "learning_rate": 3.275064128266453e-07, "loss": 0.0108, "step": 3113 }, { "epoch": 0.8879384088964928, "grad_norm": 0.9409478396226437, "learning_rate": 3.2586432869067263e-07, "loss": 0.0291, "step": 3114 }, { "epoch": 0.8882235528942116, "grad_norm": 1.9926399501252572, "learning_rate": 3.242262328542067e-07, "loss": 0.0305, "step": 3115 }, { "epoch": 0.8885086968919305, "grad_norm": 0.9596467871013998, "learning_rate": 3.225921267149845e-07, "loss": 0.0175, "step": 3116 }, { "epoch": 0.8887938408896493, "grad_norm": 1.307498365238962, "learning_rate": 3.2096201166734007e-07, "loss": 0.0352, "step": 3117 }, { "epoch": 0.8890789848873681, "grad_norm": 1.5303253426806933, "learning_rate": 3.193358891022008e-07, "loss": 0.0146, "step": 3118 }, { "epoch": 0.889364128885087, "grad_norm": 0.33325411716229314, "learning_rate": 3.17713760407089e-07, "loss": 0.0056, "step": 3119 }, { "epoch": 0.8896492728828058, "grad_norm": 1.538780520413921, "learning_rate": 3.160956269661175e-07, "loss": 0.0188, "step": 3120 }, { "epoch": 0.8899344168805247, "grad_norm": 0.7446784766283835, "learning_rate": 3.1448149015999187e-07, "loss": 0.013, "step": 3121 }, { "epoch": 0.8902195608782435, "grad_norm": 1.3141671766830856, "learning_rate": 3.1287135136600643e-07, "loss": 0.0254, "step": 3122 }, { "epoch": 0.8905047048759623, "grad_norm": 1.4567278490282125, "learning_rate": 3.112652119580428e-07, "loss": 0.0329, "step": 3123 }, { "epoch": 0.8907898488736812, "grad_norm": 1.6857368705721907, "learning_rate": 3.096630733065742e-07, "loss": 0.0288, "step": 3124 }, { "epoch": 0.8910749928714, "grad_norm": 2.0692748305620747, "learning_rate": 3.0806493677865534e-07, "loss": 0.0278, "step": 3125 }, { "epoch": 0.8913601368691189, "grad_norm": 0.7567155805940998, "learning_rate": 3.0647080373792824e-07, "loss": 0.0153, "step": 3126 }, { "epoch": 0.8916452808668378, "grad_norm": 2.5094863114875863, "learning_rate": 3.048806755446182e-07, "loss": 0.0432, "step": 3127 }, { "epoch": 0.8919304248645566, "grad_norm": 1.2661236090125079, "learning_rate": 3.032945535555354e-07, "loss": 0.0616, "step": 3128 }, { "epoch": 0.8922155688622755, "grad_norm": 0.377347821334773, "learning_rate": 3.0171243912406944e-07, "loss": 0.0064, "step": 3129 }, { "epoch": 0.8925007128599943, "grad_norm": 0.7116646851338002, "learning_rate": 3.0013433360019105e-07, "loss": 0.0234, "step": 3130 }, { "epoch": 0.8927858568577132, "grad_norm": 1.280990787410855, "learning_rate": 2.9856023833045033e-07, "loss": 0.0218, "step": 3131 }, { "epoch": 0.893071000855432, "grad_norm": 0.7586342697071281, "learning_rate": 2.969901546579751e-07, "loss": 0.0211, "step": 3132 }, { "epoch": 0.8933561448531508, "grad_norm": 0.8495441832875039, "learning_rate": 2.9542408392247036e-07, "loss": 0.0106, "step": 3133 }, { "epoch": 0.8936412888508697, "grad_norm": 1.7650449817458465, "learning_rate": 2.9386202746021773e-07, "loss": 0.0278, "step": 3134 }, { "epoch": 0.8939264328485885, "grad_norm": 0.7986477079444706, "learning_rate": 2.9230398660407277e-07, "loss": 0.0136, "step": 3135 }, { "epoch": 0.8942115768463074, "grad_norm": 0.8463514580140247, "learning_rate": 2.9074996268346533e-07, "loss": 0.0252, "step": 3136 }, { "epoch": 0.8944967208440262, "grad_norm": 1.3930496907278234, "learning_rate": 2.8919995702439696e-07, "loss": 0.0329, "step": 3137 }, { "epoch": 0.894781864841745, "grad_norm": 1.0464784961095484, "learning_rate": 2.8765397094944083e-07, "loss": 0.0249, "step": 3138 }, { "epoch": 0.8950670088394639, "grad_norm": 1.3983247885259522, "learning_rate": 2.8611200577774e-07, "loss": 0.0182, "step": 3139 }, { "epoch": 0.8953521528371827, "grad_norm": 1.0011589985184415, "learning_rate": 2.845740628250082e-07, "loss": 0.0155, "step": 3140 }, { "epoch": 0.8956372968349017, "grad_norm": 0.41127888337629276, "learning_rate": 2.8304014340352625e-07, "loss": 0.0091, "step": 3141 }, { "epoch": 0.8959224408326205, "grad_norm": 0.7008237115353383, "learning_rate": 2.815102488221394e-07, "loss": 0.0092, "step": 3142 }, { "epoch": 0.8962075848303394, "grad_norm": 1.6731644282839946, "learning_rate": 2.7998438038626174e-07, "loss": 0.0176, "step": 3143 }, { "epoch": 0.8964927288280582, "grad_norm": 1.606249319903414, "learning_rate": 2.7846253939787125e-07, "loss": 0.0415, "step": 3144 }, { "epoch": 0.896777872825777, "grad_norm": 1.300144651734939, "learning_rate": 2.7694472715550925e-07, "loss": 0.0182, "step": 3145 }, { "epoch": 0.8970630168234959, "grad_norm": 0.8102382687014947, "learning_rate": 2.7543094495427913e-07, "loss": 0.0164, "step": 3146 }, { "epoch": 0.8973481608212147, "grad_norm": 0.4796368067100617, "learning_rate": 2.7392119408584493e-07, "loss": 0.0071, "step": 3147 }, { "epoch": 0.8976333048189336, "grad_norm": 0.4708253859579334, "learning_rate": 2.7241547583843286e-07, "loss": 0.0096, "step": 3148 }, { "epoch": 0.8979184488166524, "grad_norm": 1.6685451122274635, "learning_rate": 2.7091379149682683e-07, "loss": 0.0186, "step": 3149 }, { "epoch": 0.8982035928143712, "grad_norm": 1.3143101566649746, "learning_rate": 2.6941614234236857e-07, "loss": 0.0192, "step": 3150 }, { "epoch": 0.8984887368120901, "grad_norm": 0.6686035692072104, "learning_rate": 2.679225296529564e-07, "loss": 0.0135, "step": 3151 }, { "epoch": 0.8987738808098089, "grad_norm": 0.8841163748576049, "learning_rate": 2.664329547030475e-07, "loss": 0.0315, "step": 3152 }, { "epoch": 0.8990590248075278, "grad_norm": 0.9004178197360575, "learning_rate": 2.649474187636492e-07, "loss": 0.0154, "step": 3153 }, { "epoch": 0.8993441688052467, "grad_norm": 2.2456953621383815, "learning_rate": 2.634659231023251e-07, "loss": 0.0293, "step": 3154 }, { "epoch": 0.8996293128029655, "grad_norm": 0.6666588261244653, "learning_rate": 2.619884689831909e-07, "loss": 0.0109, "step": 3155 }, { "epoch": 0.8999144568006844, "grad_norm": 0.8776064374516797, "learning_rate": 2.6051505766691464e-07, "loss": 0.0144, "step": 3156 }, { "epoch": 0.9001996007984032, "grad_norm": 1.1268349745247128, "learning_rate": 2.5904569041071417e-07, "loss": 0.0245, "step": 3157 }, { "epoch": 0.9004847447961221, "grad_norm": 0.4970618468994337, "learning_rate": 2.5758036846835476e-07, "loss": 0.0081, "step": 3158 }, { "epoch": 0.9007698887938409, "grad_norm": 1.0343525333492938, "learning_rate": 2.561190930901519e-07, "loss": 0.0217, "step": 3159 }, { "epoch": 0.9010550327915597, "grad_norm": 1.1597997219850231, "learning_rate": 2.5466186552296933e-07, "loss": 0.029, "step": 3160 }, { "epoch": 0.9013401767892786, "grad_norm": 0.5518918587988524, "learning_rate": 2.532086870102146e-07, "loss": 0.0107, "step": 3161 }, { "epoch": 0.9016253207869974, "grad_norm": 1.555015490765001, "learning_rate": 2.5175955879184146e-07, "loss": 0.0201, "step": 3162 }, { "epoch": 0.9019104647847163, "grad_norm": 0.6795609328872417, "learning_rate": 2.503144821043474e-07, "loss": 0.0142, "step": 3163 }, { "epoch": 0.9021956087824351, "grad_norm": 0.6884525201103593, "learning_rate": 2.488734581807728e-07, "loss": 0.0185, "step": 3164 }, { "epoch": 0.9024807527801539, "grad_norm": 0.5375639386409782, "learning_rate": 2.474364882507002e-07, "loss": 0.0059, "step": 3165 }, { "epoch": 0.9027658967778728, "grad_norm": 0.5080847121580953, "learning_rate": 2.4600357354025275e-07, "loss": 0.0072, "step": 3166 }, { "epoch": 0.9030510407755917, "grad_norm": 0.806467203465794, "learning_rate": 2.4457471527209343e-07, "loss": 0.0246, "step": 3167 }, { "epoch": 0.9033361847733106, "grad_norm": 1.700259776983858, "learning_rate": 2.431499146654243e-07, "loss": 0.0195, "step": 3168 }, { "epoch": 0.9036213287710294, "grad_norm": 1.066316674315371, "learning_rate": 2.4172917293598607e-07, "loss": 0.0332, "step": 3169 }, { "epoch": 0.9039064727687482, "grad_norm": 1.2640803766957995, "learning_rate": 2.4031249129605305e-07, "loss": 0.0148, "step": 3170 }, { "epoch": 0.9041916167664671, "grad_norm": 2.356961541868472, "learning_rate": 2.3889987095443657e-07, "loss": 0.0305, "step": 3171 }, { "epoch": 0.9044767607641859, "grad_norm": 0.8215136443424736, "learning_rate": 2.3749131311648576e-07, "loss": 0.0111, "step": 3172 }, { "epoch": 0.9047619047619048, "grad_norm": 1.21999981493346, "learning_rate": 2.360868189840787e-07, "loss": 0.0162, "step": 3173 }, { "epoch": 0.9050470487596236, "grad_norm": 1.955946628179618, "learning_rate": 2.346863897556295e-07, "loss": 0.033, "step": 3174 }, { "epoch": 0.9053321927573424, "grad_norm": 1.3460408883251782, "learning_rate": 2.3329002662608068e-07, "loss": 0.0461, "step": 3175 }, { "epoch": 0.9056173367550613, "grad_norm": 1.1009867392611814, "learning_rate": 2.3189773078690748e-07, "loss": 0.0119, "step": 3176 }, { "epoch": 0.9059024807527801, "grad_norm": 0.6976449459445861, "learning_rate": 2.305095034261151e-07, "loss": 0.0104, "step": 3177 }, { "epoch": 0.906187624750499, "grad_norm": 1.050109166261638, "learning_rate": 2.2912534572823498e-07, "loss": 0.0226, "step": 3178 }, { "epoch": 0.9064727687482178, "grad_norm": 0.6801788595908712, "learning_rate": 2.2774525887432786e-07, "loss": 0.0185, "step": 3179 }, { "epoch": 0.9067579127459366, "grad_norm": 0.6869437524292872, "learning_rate": 2.2636924404198014e-07, "loss": 0.0104, "step": 3180 }, { "epoch": 0.9070430567436556, "grad_norm": 1.4278818202476977, "learning_rate": 2.2499730240530426e-07, "loss": 0.0239, "step": 3181 }, { "epoch": 0.9073282007413744, "grad_norm": 0.6426812316033088, "learning_rate": 2.2362943513493662e-07, "loss": 0.0164, "step": 3182 }, { "epoch": 0.9076133447390933, "grad_norm": 0.7907561871363531, "learning_rate": 2.2226564339803636e-07, "loss": 0.0096, "step": 3183 }, { "epoch": 0.9078984887368121, "grad_norm": 0.9117928880337626, "learning_rate": 2.2090592835828817e-07, "loss": 0.017, "step": 3184 }, { "epoch": 0.908183632734531, "grad_norm": 0.9775102371730449, "learning_rate": 2.1955029117589454e-07, "loss": 0.0145, "step": 3185 }, { "epoch": 0.9084687767322498, "grad_norm": 0.6325452457433002, "learning_rate": 2.1819873300758022e-07, "loss": 0.0088, "step": 3186 }, { "epoch": 0.9087539207299686, "grad_norm": 0.8958197615264045, "learning_rate": 2.168512550065882e-07, "loss": 0.0312, "step": 3187 }, { "epoch": 0.9090390647276875, "grad_norm": 0.6725945959693137, "learning_rate": 2.1550785832268217e-07, "loss": 0.0147, "step": 3188 }, { "epoch": 0.9093242087254063, "grad_norm": 0.7200310122010458, "learning_rate": 2.1416854410214183e-07, "loss": 0.009, "step": 3189 }, { "epoch": 0.9096093527231252, "grad_norm": 1.3177205015746256, "learning_rate": 2.1283331348776414e-07, "loss": 0.0207, "step": 3190 }, { "epoch": 0.909894496720844, "grad_norm": 1.6289098120134233, "learning_rate": 2.115021676188611e-07, "loss": 0.0245, "step": 3191 }, { "epoch": 0.9101796407185628, "grad_norm": 1.9024020760670197, "learning_rate": 2.101751076312586e-07, "loss": 0.0367, "step": 3192 }, { "epoch": 0.9104647847162817, "grad_norm": 1.1389262118067653, "learning_rate": 2.0885213465729802e-07, "loss": 0.0238, "step": 3193 }, { "epoch": 0.9107499287140006, "grad_norm": 2.0058186055618106, "learning_rate": 2.0753324982583202e-07, "loss": 0.0419, "step": 3194 }, { "epoch": 0.9110350727117195, "grad_norm": 1.2169233475560073, "learning_rate": 2.0621845426222587e-07, "loss": 0.0326, "step": 3195 }, { "epoch": 0.9113202167094383, "grad_norm": 0.3754420977930853, "learning_rate": 2.0490774908835442e-07, "loss": 0.0057, "step": 3196 }, { "epoch": 0.9116053607071571, "grad_norm": 2.673609902784827, "learning_rate": 2.0360113542260307e-07, "loss": 0.0349, "step": 3197 }, { "epoch": 0.911890504704876, "grad_norm": 1.0286839819383613, "learning_rate": 2.0229861437986665e-07, "loss": 0.0309, "step": 3198 }, { "epoch": 0.9121756487025948, "grad_norm": 1.100305443067917, "learning_rate": 2.0100018707154612e-07, "loss": 0.0127, "step": 3199 }, { "epoch": 0.9124607927003137, "grad_norm": 0.9752425027943912, "learning_rate": 1.9970585460555193e-07, "loss": 0.0114, "step": 3200 }, { "epoch": 0.9127459366980325, "grad_norm": 1.5804332245975838, "learning_rate": 1.984156180862984e-07, "loss": 0.036, "step": 3201 }, { "epoch": 0.9130310806957513, "grad_norm": 1.2405391166369548, "learning_rate": 1.97129478614706e-07, "loss": 0.0198, "step": 3202 }, { "epoch": 0.9133162246934702, "grad_norm": 1.4988667995021159, "learning_rate": 1.958474372881969e-07, "loss": 0.0165, "step": 3203 }, { "epoch": 0.913601368691189, "grad_norm": 2.6758434050413924, "learning_rate": 1.94569495200701e-07, "loss": 0.0473, "step": 3204 }, { "epoch": 0.9138865126889079, "grad_norm": 0.37648975473173746, "learning_rate": 1.9329565344264666e-07, "loss": 0.009, "step": 3205 }, { "epoch": 0.9141716566866267, "grad_norm": 1.4270330071786936, "learning_rate": 1.9202591310096495e-07, "loss": 0.0318, "step": 3206 }, { "epoch": 0.9144568006843456, "grad_norm": 1.1284641851442696, "learning_rate": 1.9076027525908704e-07, "loss": 0.0169, "step": 3207 }, { "epoch": 0.9147419446820645, "grad_norm": 0.22940010951660872, "learning_rate": 1.8949874099694344e-07, "loss": 0.0054, "step": 3208 }, { "epoch": 0.9150270886797833, "grad_norm": 0.472117646501281, "learning_rate": 1.8824131139096424e-07, "loss": 0.013, "step": 3209 }, { "epoch": 0.9153122326775022, "grad_norm": 1.0145391952231864, "learning_rate": 1.8698798751407566e-07, "loss": 0.0163, "step": 3210 }, { "epoch": 0.915597376675221, "grad_norm": 1.6566652751095565, "learning_rate": 1.8573877043570166e-07, "loss": 0.0293, "step": 3211 }, { "epoch": 0.9158825206729398, "grad_norm": 0.8730480588302432, "learning_rate": 1.8449366122176072e-07, "loss": 0.0211, "step": 3212 }, { "epoch": 0.9161676646706587, "grad_norm": 0.762722802208298, "learning_rate": 1.8325266093466908e-07, "loss": 0.0167, "step": 3213 }, { "epoch": 0.9164528086683775, "grad_norm": 1.8519175215823354, "learning_rate": 1.820157706333331e-07, "loss": 0.042, "step": 3214 }, { "epoch": 0.9167379526660964, "grad_norm": 0.6178345429669876, "learning_rate": 1.8078299137315457e-07, "loss": 0.0115, "step": 3215 }, { "epoch": 0.9170230966638152, "grad_norm": 1.8389032255626088, "learning_rate": 1.7955432420602714e-07, "loss": 0.0449, "step": 3216 }, { "epoch": 0.917308240661534, "grad_norm": 0.8976456881308423, "learning_rate": 1.7832977018033604e-07, "loss": 0.0146, "step": 3217 }, { "epoch": 0.9175933846592529, "grad_norm": 1.1641398023584142, "learning_rate": 1.7710933034095658e-07, "loss": 0.0167, "step": 3218 }, { "epoch": 0.9178785286569717, "grad_norm": 0.34100280170072866, "learning_rate": 1.7589300572925184e-07, "loss": 0.0044, "step": 3219 }, { "epoch": 0.9181636726546906, "grad_norm": 1.7683542198735762, "learning_rate": 1.7468079738307608e-07, "loss": 0.0254, "step": 3220 }, { "epoch": 0.9184488166524095, "grad_norm": 0.9779660998455367, "learning_rate": 1.7347270633677082e-07, "loss": 0.0247, "step": 3221 }, { "epoch": 0.9187339606501284, "grad_norm": 0.7604628137905527, "learning_rate": 1.722687336211626e-07, "loss": 0.016, "step": 3222 }, { "epoch": 0.9190191046478472, "grad_norm": 1.9616843025886015, "learning_rate": 1.7106888026356626e-07, "loss": 0.0218, "step": 3223 }, { "epoch": 0.919304248645566, "grad_norm": 1.4389747508794408, "learning_rate": 1.6987314728778014e-07, "loss": 0.1148, "step": 3224 }, { "epoch": 0.9195893926432849, "grad_norm": 1.3027980674511903, "learning_rate": 1.6868153571408695e-07, "loss": 0.0248, "step": 3225 }, { "epoch": 0.9198745366410037, "grad_norm": 0.4343153104128488, "learning_rate": 1.6749404655925338e-07, "loss": 0.009, "step": 3226 }, { "epoch": 0.9201596806387226, "grad_norm": 0.47044929809354835, "learning_rate": 1.663106808365289e-07, "loss": 0.0077, "step": 3227 }, { "epoch": 0.9204448246364414, "grad_norm": 1.7245884546851775, "learning_rate": 1.6513143955564192e-07, "loss": 0.0386, "step": 3228 }, { "epoch": 0.9207299686341602, "grad_norm": 1.2986893334701157, "learning_rate": 1.6395632372280646e-07, "loss": 0.0219, "step": 3229 }, { "epoch": 0.9210151126318791, "grad_norm": 1.1072209359887417, "learning_rate": 1.62785334340711e-07, "loss": 0.0162, "step": 3230 }, { "epoch": 0.9213002566295979, "grad_norm": 1.116053157390405, "learning_rate": 1.6161847240852624e-07, "loss": 0.0133, "step": 3231 }, { "epoch": 0.9215854006273168, "grad_norm": 0.6591215243411573, "learning_rate": 1.6045573892190136e-07, "loss": 0.0161, "step": 3232 }, { "epoch": 0.9218705446250356, "grad_norm": 0.44109595747664676, "learning_rate": 1.5929713487296162e-07, "loss": 0.0055, "step": 3233 }, { "epoch": 0.9221556886227545, "grad_norm": 1.3398968208505428, "learning_rate": 1.581426612503084e-07, "loss": 0.0225, "step": 3234 }, { "epoch": 0.9224408326204734, "grad_norm": 1.4579154403674046, "learning_rate": 1.5699231903901934e-07, "loss": 0.0216, "step": 3235 }, { "epoch": 0.9227259766181922, "grad_norm": 2.4188092429370704, "learning_rate": 1.5584610922064759e-07, "loss": 0.0523, "step": 3236 }, { "epoch": 0.9230111206159111, "grad_norm": 0.5884136082750657, "learning_rate": 1.547040327732191e-07, "loss": 0.0091, "step": 3237 }, { "epoch": 0.9232962646136299, "grad_norm": 1.108590752242791, "learning_rate": 1.535660906712333e-07, "loss": 0.009, "step": 3238 }, { "epoch": 0.9235814086113487, "grad_norm": 1.3748359488240862, "learning_rate": 1.5243228388566233e-07, "loss": 0.0145, "step": 3239 }, { "epoch": 0.9238665526090676, "grad_norm": 0.20338331633106474, "learning_rate": 1.5130261338394904e-07, "loss": 0.004, "step": 3240 }, { "epoch": 0.9241516966067864, "grad_norm": 0.6798693978374936, "learning_rate": 1.5017708013000787e-07, "loss": 0.02, "step": 3241 }, { "epoch": 0.9244368406045053, "grad_norm": 1.2945381953335924, "learning_rate": 1.4905568508422173e-07, "loss": 0.0221, "step": 3242 }, { "epoch": 0.9247219846022241, "grad_norm": 1.1088855335116197, "learning_rate": 1.4793842920344358e-07, "loss": 0.0173, "step": 3243 }, { "epoch": 0.9250071285999429, "grad_norm": 0.6902543763473945, "learning_rate": 1.468253134409947e-07, "loss": 0.0078, "step": 3244 }, { "epoch": 0.9252922725976618, "grad_norm": 0.36426685758935945, "learning_rate": 1.4571633874666313e-07, "loss": 0.0068, "step": 3245 }, { "epoch": 0.9255774165953806, "grad_norm": 1.089534630630725, "learning_rate": 1.4461150606670414e-07, "loss": 0.0283, "step": 3246 }, { "epoch": 0.9258625605930996, "grad_norm": 1.1642004537200386, "learning_rate": 1.4351081634383647e-07, "loss": 0.0257, "step": 3247 }, { "epoch": 0.9261477045908184, "grad_norm": 0.7868438826662598, "learning_rate": 1.4241427051724765e-07, "loss": 0.0103, "step": 3248 }, { "epoch": 0.9264328485885372, "grad_norm": 0.9770444382275083, "learning_rate": 1.4132186952258653e-07, "loss": 0.0153, "step": 3249 }, { "epoch": 0.9267179925862561, "grad_norm": 1.785958725620067, "learning_rate": 1.402336142919658e-07, "loss": 0.0264, "step": 3250 }, { "epoch": 0.9270031365839749, "grad_norm": 1.2964838487983503, "learning_rate": 1.3914950575396102e-07, "loss": 0.0267, "step": 3251 }, { "epoch": 0.9272882805816938, "grad_norm": 1.6603947739477958, "learning_rate": 1.3806954483361002e-07, "loss": 0.0208, "step": 3252 }, { "epoch": 0.9275734245794126, "grad_norm": 1.9307212298456977, "learning_rate": 1.369937324524101e-07, "loss": 0.0458, "step": 3253 }, { "epoch": 0.9278585685771314, "grad_norm": 3.4936463480149653, "learning_rate": 1.3592206952832031e-07, "loss": 0.0591, "step": 3254 }, { "epoch": 0.9281437125748503, "grad_norm": 1.3343590595906798, "learning_rate": 1.3485455697575755e-07, "loss": 0.0246, "step": 3255 }, { "epoch": 0.9284288565725691, "grad_norm": 1.4183852372661856, "learning_rate": 1.3379119570559872e-07, "loss": 0.0362, "step": 3256 }, { "epoch": 0.928714000570288, "grad_norm": 1.540850659243285, "learning_rate": 1.3273198662517917e-07, "loss": 0.0269, "step": 3257 }, { "epoch": 0.9289991445680068, "grad_norm": 1.5560860076422636, "learning_rate": 1.3167693063828867e-07, "loss": 0.0163, "step": 3258 }, { "epoch": 0.9292842885657256, "grad_norm": 1.484976547707544, "learning_rate": 1.3062602864517548e-07, "loss": 0.0472, "step": 3259 }, { "epoch": 0.9295694325634445, "grad_norm": 0.6381737168749684, "learning_rate": 1.2957928154254174e-07, "loss": 0.0107, "step": 3260 }, { "epoch": 0.9298545765611634, "grad_norm": 1.1036012557283326, "learning_rate": 1.285366902235463e-07, "loss": 0.0258, "step": 3261 }, { "epoch": 0.9301397205588823, "grad_norm": 1.9037929843957349, "learning_rate": 1.2749825557780148e-07, "loss": 0.0647, "step": 3262 }, { "epoch": 0.9304248645566011, "grad_norm": 1.1279142271655538, "learning_rate": 1.264639784913707e-07, "loss": 0.0206, "step": 3263 }, { "epoch": 0.93071000855432, "grad_norm": 1.3538375710084851, "learning_rate": 1.2543385984677249e-07, "loss": 0.0294, "step": 3264 }, { "epoch": 0.9309951525520388, "grad_norm": 1.5857087537071777, "learning_rate": 1.2440790052297648e-07, "loss": 0.0262, "step": 3265 }, { "epoch": 0.9312802965497576, "grad_norm": 0.8358164956623412, "learning_rate": 1.233861013954024e-07, "loss": 0.0128, "step": 3266 }, { "epoch": 0.9315654405474765, "grad_norm": 1.0889195632514523, "learning_rate": 1.2236846333592068e-07, "loss": 0.0161, "step": 3267 }, { "epoch": 0.9318505845451953, "grad_norm": 1.3316230123061, "learning_rate": 1.2135498721285167e-07, "loss": 0.0568, "step": 3268 }, { "epoch": 0.9321357285429142, "grad_norm": 1.1761990589238678, "learning_rate": 1.2034567389096364e-07, "loss": 0.0179, "step": 3269 }, { "epoch": 0.932420872540633, "grad_norm": 1.460756188712811, "learning_rate": 1.193405242314738e-07, "loss": 0.0242, "step": 3270 }, { "epoch": 0.9327060165383518, "grad_norm": 1.4799019491857686, "learning_rate": 1.1833953909204554e-07, "loss": 0.0227, "step": 3271 }, { "epoch": 0.9329911605360707, "grad_norm": 1.500459177807598, "learning_rate": 1.1734271932679008e-07, "loss": 0.0475, "step": 3272 }, { "epoch": 0.9332763045337895, "grad_norm": 1.4260297811235727, "learning_rate": 1.1635006578626374e-07, "loss": 0.0249, "step": 3273 }, { "epoch": 0.9335614485315085, "grad_norm": 2.2936699937794636, "learning_rate": 1.1536157931746728e-07, "loss": 0.049, "step": 3274 }, { "epoch": 0.9338465925292273, "grad_norm": 1.3381636184621566, "learning_rate": 1.1437726076384715e-07, "loss": 0.0273, "step": 3275 }, { "epoch": 0.9341317365269461, "grad_norm": 2.3124956924737776, "learning_rate": 1.1339711096529149e-07, "loss": 0.0484, "step": 3276 }, { "epoch": 0.934416880524665, "grad_norm": 1.3552422519009062, "learning_rate": 1.1242113075813466e-07, "loss": 0.0544, "step": 3277 }, { "epoch": 0.9347020245223838, "grad_norm": 0.4791576527546003, "learning_rate": 1.1144932097515048e-07, "loss": 0.0083, "step": 3278 }, { "epoch": 0.9349871685201027, "grad_norm": 1.1383690407288758, "learning_rate": 1.1048168244555513e-07, "loss": 0.0321, "step": 3279 }, { "epoch": 0.9352723125178215, "grad_norm": 0.5018323683744155, "learning_rate": 1.0951821599500423e-07, "loss": 0.0099, "step": 3280 }, { "epoch": 0.9355574565155403, "grad_norm": 0.38194627856193947, "learning_rate": 1.0855892244559573e-07, "loss": 0.0058, "step": 3281 }, { "epoch": 0.9358426005132592, "grad_norm": 1.7992899459503027, "learning_rate": 1.0760380261586656e-07, "loss": 0.022, "step": 3282 }, { "epoch": 0.936127744510978, "grad_norm": 1.608919056354979, "learning_rate": 1.0665285732079145e-07, "loss": 0.0373, "step": 3283 }, { "epoch": 0.9364128885086969, "grad_norm": 0.5295751980400589, "learning_rate": 1.0570608737178245e-07, "loss": 0.0143, "step": 3284 }, { "epoch": 0.9366980325064157, "grad_norm": 1.0884180574425513, "learning_rate": 1.0476349357669113e-07, "loss": 0.0176, "step": 3285 }, { "epoch": 0.9369831765041345, "grad_norm": 0.8642293290298916, "learning_rate": 1.0382507673980358e-07, "loss": 0.0239, "step": 3286 }, { "epoch": 0.9372683205018535, "grad_norm": 1.0721029415636114, "learning_rate": 1.0289083766184371e-07, "loss": 0.0125, "step": 3287 }, { "epoch": 0.9375534644995723, "grad_norm": 1.8058621968187927, "learning_rate": 1.0196077713996777e-07, "loss": 0.0268, "step": 3288 }, { "epoch": 0.9378386084972912, "grad_norm": 1.6236022683941318, "learning_rate": 1.0103489596777094e-07, "loss": 0.0507, "step": 3289 }, { "epoch": 0.93812375249501, "grad_norm": 0.3847928622518018, "learning_rate": 1.0011319493527849e-07, "loss": 0.0048, "step": 3290 }, { "epoch": 0.9384088964927289, "grad_norm": 0.5354601853769945, "learning_rate": 9.919567482894965e-08, "loss": 0.0138, "step": 3291 }, { "epoch": 0.9386940404904477, "grad_norm": 1.707818158964234, "learning_rate": 9.828233643167762e-08, "loss": 0.0157, "step": 3292 }, { "epoch": 0.9389791844881665, "grad_norm": 0.8286681216996449, "learning_rate": 9.737318052278622e-08, "loss": 0.0227, "step": 3293 }, { "epoch": 0.9392643284858854, "grad_norm": 1.638770990611571, "learning_rate": 9.646820787803102e-08, "loss": 0.0238, "step": 3294 }, { "epoch": 0.9395494724836042, "grad_norm": 1.4844955571277005, "learning_rate": 9.556741926959878e-08, "loss": 0.0251, "step": 3295 }, { "epoch": 0.939834616481323, "grad_norm": 1.2669550894482697, "learning_rate": 9.467081546610357e-08, "loss": 0.0279, "step": 3296 }, { "epoch": 0.9401197604790419, "grad_norm": 0.607099774823454, "learning_rate": 9.377839723259175e-08, "loss": 0.0121, "step": 3297 }, { "epoch": 0.9404049044767607, "grad_norm": 0.8781917053000157, "learning_rate": 9.289016533053696e-08, "loss": 0.0104, "step": 3298 }, { "epoch": 0.9406900484744796, "grad_norm": 1.3542338084425016, "learning_rate": 9.200612051784019e-08, "loss": 0.0167, "step": 3299 }, { "epoch": 0.9409751924721984, "grad_norm": 1.1515026257845251, "learning_rate": 9.112626354883025e-08, "loss": 0.0166, "step": 3300 }, { "epoch": 0.9412603364699174, "grad_norm": 1.2561544765269976, "learning_rate": 9.025059517426383e-08, "loss": 0.0202, "step": 3301 }, { "epoch": 0.9415454804676362, "grad_norm": 0.8869786585507469, "learning_rate": 8.937911614132155e-08, "loss": 0.0113, "step": 3302 }, { "epoch": 0.941830624465355, "grad_norm": 1.1028012235771263, "learning_rate": 8.851182719361029e-08, "loss": 0.0213, "step": 3303 }, { "epoch": 0.9421157684630739, "grad_norm": 0.8016773289830269, "learning_rate": 8.764872907116084e-08, "loss": 0.0126, "step": 3304 }, { "epoch": 0.9424009124607927, "grad_norm": 2.1285540435739647, "learning_rate": 8.678982251043078e-08, "loss": 0.0291, "step": 3305 }, { "epoch": 0.9426860564585116, "grad_norm": 0.5550319327426171, "learning_rate": 8.59351082442983e-08, "loss": 0.0102, "step": 3306 }, { "epoch": 0.9429712004562304, "grad_norm": 0.8827415276408771, "learning_rate": 8.508458700206501e-08, "loss": 0.0167, "step": 3307 }, { "epoch": 0.9432563444539492, "grad_norm": 1.0858933176242238, "learning_rate": 8.423825950945541e-08, "loss": 0.0139, "step": 3308 }, { "epoch": 0.9435414884516681, "grad_norm": 0.9312052457091772, "learning_rate": 8.339612648861573e-08, "loss": 0.0189, "step": 3309 }, { "epoch": 0.9438266324493869, "grad_norm": 0.8767853437349821, "learning_rate": 8.255818865811226e-08, "loss": 0.0209, "step": 3310 }, { "epoch": 0.9441117764471058, "grad_norm": 1.1912037158829953, "learning_rate": 8.172444673293201e-08, "loss": 0.0154, "step": 3311 }, { "epoch": 0.9443969204448246, "grad_norm": 1.1446724760051077, "learning_rate": 8.089490142448254e-08, "loss": 0.0159, "step": 3312 }, { "epoch": 0.9446820644425434, "grad_norm": 1.254371535428258, "learning_rate": 8.006955344058986e-08, "loss": 0.0203, "step": 3313 }, { "epoch": 0.9449672084402624, "grad_norm": 0.9243205335182174, "learning_rate": 7.92484034854979e-08, "loss": 0.0147, "step": 3314 }, { "epoch": 0.9452523524379812, "grad_norm": 0.5842500345183507, "learning_rate": 7.843145225987003e-08, "loss": 0.0075, "step": 3315 }, { "epoch": 0.9455374964357001, "grad_norm": 1.493712093765859, "learning_rate": 7.761870046078534e-08, "loss": 0.0231, "step": 3316 }, { "epoch": 0.9458226404334189, "grad_norm": 1.162136015617278, "learning_rate": 7.681014878174187e-08, "loss": 0.0321, "step": 3317 }, { "epoch": 0.9461077844311377, "grad_norm": 0.5474839381848375, "learning_rate": 7.600579791265161e-08, "loss": 0.0117, "step": 3318 }, { "epoch": 0.9463929284288566, "grad_norm": 1.3755166848139597, "learning_rate": 7.52056485398428e-08, "loss": 0.0244, "step": 3319 }, { "epoch": 0.9466780724265754, "grad_norm": 1.3943608791399622, "learning_rate": 7.440970134605819e-08, "loss": 0.0342, "step": 3320 }, { "epoch": 0.9469632164242943, "grad_norm": 0.9763179433503778, "learning_rate": 7.361795701045726e-08, "loss": 0.0233, "step": 3321 }, { "epoch": 0.9472483604220131, "grad_norm": 0.8089246096238166, "learning_rate": 7.283041620861131e-08, "loss": 0.0127, "step": 3322 }, { "epoch": 0.9475335044197319, "grad_norm": 0.7679809907075772, "learning_rate": 7.204707961250446e-08, "loss": 0.0156, "step": 3323 }, { "epoch": 0.9478186484174508, "grad_norm": 0.9473437484277629, "learning_rate": 7.126794789053426e-08, "loss": 0.0147, "step": 3324 }, { "epoch": 0.9481037924151696, "grad_norm": 1.38358551291656, "learning_rate": 7.049302170751115e-08, "loss": 0.0567, "step": 3325 }, { "epoch": 0.9483889364128885, "grad_norm": 1.2621054129425233, "learning_rate": 6.972230172465567e-08, "loss": 0.0231, "step": 3326 }, { "epoch": 0.9486740804106074, "grad_norm": 0.3297509743111443, "learning_rate": 6.895578859960062e-08, "loss": 0.0072, "step": 3327 }, { "epoch": 0.9489592244083263, "grad_norm": 0.2484790392335108, "learning_rate": 6.819348298638839e-08, "loss": 0.0045, "step": 3328 }, { "epoch": 0.9492443684060451, "grad_norm": 0.5873609332531889, "learning_rate": 6.743538553547091e-08, "loss": 0.0107, "step": 3329 }, { "epoch": 0.9495295124037639, "grad_norm": 1.7018257018683238, "learning_rate": 6.668149689371074e-08, "loss": 0.0432, "step": 3330 }, { "epoch": 0.9498146564014828, "grad_norm": 0.5246171733900967, "learning_rate": 6.593181770437829e-08, "loss": 0.0068, "step": 3331 }, { "epoch": 0.9500998003992016, "grad_norm": 0.7763822843561252, "learning_rate": 6.518634860715134e-08, "loss": 0.0162, "step": 3332 }, { "epoch": 0.9503849443969205, "grad_norm": 1.0168579745893171, "learning_rate": 6.444509023811773e-08, "loss": 0.0209, "step": 3333 }, { "epoch": 0.9506700883946393, "grad_norm": 1.075777400723064, "learning_rate": 6.370804322977042e-08, "loss": 0.0189, "step": 3334 }, { "epoch": 0.9509552323923581, "grad_norm": 1.5206330880639494, "learning_rate": 6.297520821100911e-08, "loss": 0.0462, "step": 3335 }, { "epoch": 0.951240376390077, "grad_norm": 1.5897941053837084, "learning_rate": 6.224658580713971e-08, "loss": 0.0238, "step": 3336 }, { "epoch": 0.9515255203877958, "grad_norm": 2.1685547816279316, "learning_rate": 6.152217663987437e-08, "loss": 0.0302, "step": 3337 }, { "epoch": 0.9518106643855146, "grad_norm": 0.41072290294514274, "learning_rate": 6.080198132732917e-08, "loss": 0.0062, "step": 3338 }, { "epoch": 0.9520958083832335, "grad_norm": 1.6595471882531296, "learning_rate": 6.008600048402647e-08, "loss": 0.0236, "step": 3339 }, { "epoch": 0.9523809523809523, "grad_norm": 1.3876995960785814, "learning_rate": 5.937423472088866e-08, "loss": 0.0173, "step": 3340 }, { "epoch": 0.9526660963786713, "grad_norm": 0.95914211608916, "learning_rate": 5.866668464524661e-08, "loss": 0.0146, "step": 3341 }, { "epoch": 0.9529512403763901, "grad_norm": 2.367073977432817, "learning_rate": 5.796335086083016e-08, "loss": 0.0281, "step": 3342 }, { "epoch": 0.953236384374109, "grad_norm": 0.8209213096210055, "learning_rate": 5.7264233967773696e-08, "loss": 0.0127, "step": 3343 }, { "epoch": 0.9535215283718278, "grad_norm": 1.8947375988470225, "learning_rate": 5.6569334562611714e-08, "loss": 0.0205, "step": 3344 }, { "epoch": 0.9538066723695466, "grad_norm": 1.043061161634943, "learning_rate": 5.5878653238281564e-08, "loss": 0.0209, "step": 3345 }, { "epoch": 0.9540918163672655, "grad_norm": 0.8863361955005523, "learning_rate": 5.519219058412129e-08, "loss": 0.0098, "step": 3346 }, { "epoch": 0.9543769603649843, "grad_norm": 0.5971170239016343, "learning_rate": 5.4509947185867883e-08, "loss": 0.0162, "step": 3347 }, { "epoch": 0.9546621043627032, "grad_norm": 1.117922239866681, "learning_rate": 5.3831923625659034e-08, "loss": 0.0235, "step": 3348 }, { "epoch": 0.954947248360422, "grad_norm": 0.4746383532243566, "learning_rate": 5.315812048203306e-08, "loss": 0.0081, "step": 3349 }, { "epoch": 0.9552323923581408, "grad_norm": 0.7993643810754686, "learning_rate": 5.2488538329926175e-08, "loss": 0.0214, "step": 3350 }, { "epoch": 0.9555175363558597, "grad_norm": 1.4852695501324207, "learning_rate": 5.18231777406708e-08, "loss": 0.032, "step": 3351 }, { "epoch": 0.9558026803535785, "grad_norm": 2.15399813077587, "learning_rate": 5.116203928200003e-08, "loss": 0.056, "step": 3352 }, { "epoch": 0.9560878243512974, "grad_norm": 1.4790577170889294, "learning_rate": 5.050512351804371e-08, "loss": 0.035, "step": 3353 }, { "epoch": 0.9563729683490163, "grad_norm": 1.5138221982520688, "learning_rate": 4.9852431009328464e-08, "loss": 0.0206, "step": 3354 }, { "epoch": 0.9566581123467351, "grad_norm": 1.2976796226213914, "learning_rate": 4.920396231277713e-08, "loss": 0.0185, "step": 3355 }, { "epoch": 0.956943256344454, "grad_norm": 0.8940591579415054, "learning_rate": 4.855971798170822e-08, "loss": 0.0161, "step": 3356 }, { "epoch": 0.9572284003421728, "grad_norm": 1.3689820258335235, "learning_rate": 4.7919698565835894e-08, "loss": 0.0346, "step": 3357 }, { "epoch": 0.9575135443398917, "grad_norm": 0.6602976007376902, "learning_rate": 4.728390461126997e-08, "loss": 0.0135, "step": 3358 }, { "epoch": 0.9577986883376105, "grad_norm": 1.7405971970346032, "learning_rate": 4.6652336660514275e-08, "loss": 0.0391, "step": 3359 }, { "epoch": 0.9580838323353293, "grad_norm": 0.7948682203377249, "learning_rate": 4.602499525246606e-08, "loss": 0.0095, "step": 3360 }, { "epoch": 0.9583689763330482, "grad_norm": 2.246065889190937, "learning_rate": 4.5401880922418264e-08, "loss": 0.0295, "step": 3361 }, { "epoch": 0.958654120330767, "grad_norm": 0.998355740167951, "learning_rate": 4.478299420205445e-08, "loss": 0.0134, "step": 3362 }, { "epoch": 0.9589392643284859, "grad_norm": 1.4618361724398996, "learning_rate": 4.416833561945222e-08, "loss": 0.0361, "step": 3363 }, { "epoch": 0.9592244083262047, "grad_norm": 1.3947578420400226, "learning_rate": 4.355790569908147e-08, "loss": 0.0316, "step": 3364 }, { "epoch": 0.9595095523239235, "grad_norm": 1.092207434346036, "learning_rate": 4.29517049618039e-08, "loss": 0.0205, "step": 3365 }, { "epoch": 0.9597946963216424, "grad_norm": 1.2842978731112948, "learning_rate": 4.2349733924872406e-08, "loss": 0.0261, "step": 3366 }, { "epoch": 0.9600798403193613, "grad_norm": 1.9319381855432591, "learning_rate": 4.1751993101930565e-08, "loss": 0.0313, "step": 3367 }, { "epoch": 0.9603649843170802, "grad_norm": 1.9881438705501773, "learning_rate": 4.1158483003012614e-08, "loss": 0.0338, "step": 3368 }, { "epoch": 0.960650128314799, "grad_norm": 1.5737829057522759, "learning_rate": 4.056920413454291e-08, "loss": 0.0237, "step": 3369 }, { "epoch": 0.9609352723125179, "grad_norm": 1.000096415972, "learning_rate": 3.9984156999335334e-08, "loss": 0.0145, "step": 3370 }, { "epoch": 0.9612204163102367, "grad_norm": 1.35746116703504, "learning_rate": 3.94033420965928e-08, "loss": 0.015, "step": 3371 }, { "epoch": 0.9615055603079555, "grad_norm": 2.149928406053472, "learning_rate": 3.882675992190832e-08, "loss": 0.0354, "step": 3372 }, { "epoch": 0.9617907043056744, "grad_norm": 0.7299966913484716, "learning_rate": 3.825441096726057e-08, "loss": 0.008, "step": 3373 }, { "epoch": 0.9620758483033932, "grad_norm": 0.3263502956977538, "learning_rate": 3.76862957210189e-08, "loss": 0.004, "step": 3374 }, { "epoch": 0.962360992301112, "grad_norm": 1.2745281137121607, "learning_rate": 3.712241466793887e-08, "loss": 0.0148, "step": 3375 }, { "epoch": 0.9626461362988309, "grad_norm": 1.0802932953346063, "learning_rate": 3.6562768289162834e-08, "loss": 0.0155, "step": 3376 }, { "epoch": 0.9629312802965497, "grad_norm": 0.8278153218304807, "learning_rate": 3.6007357062219914e-08, "loss": 0.0122, "step": 3377 }, { "epoch": 0.9632164242942686, "grad_norm": 0.22430709604137844, "learning_rate": 3.5456181461026585e-08, "loss": 0.0042, "step": 3378 }, { "epoch": 0.9635015682919874, "grad_norm": 0.8451639389040133, "learning_rate": 3.4909241955883853e-08, "loss": 0.014, "step": 3379 }, { "epoch": 0.9637867122897062, "grad_norm": 1.3080849693612018, "learning_rate": 3.4366539013478975e-08, "loss": 0.024, "step": 3380 }, { "epoch": 0.9640718562874252, "grad_norm": 1.4925602998353593, "learning_rate": 3.3828073096884294e-08, "loss": 0.0229, "step": 3381 }, { "epoch": 0.964357000285144, "grad_norm": 0.9866730078294964, "learning_rate": 3.329384466555619e-08, "loss": 0.0332, "step": 3382 }, { "epoch": 0.9646421442828629, "grad_norm": 0.53963382342648, "learning_rate": 3.276385417533612e-08, "loss": 0.0076, "step": 3383 }, { "epoch": 0.9649272882805817, "grad_norm": 2.028169739046422, "learning_rate": 3.2238102078448466e-08, "loss": 0.0525, "step": 3384 }, { "epoch": 0.9652124322783006, "grad_norm": 0.70221035179003, "learning_rate": 3.17165888235027e-08, "loss": 0.0147, "step": 3385 }, { "epoch": 0.9654975762760194, "grad_norm": 1.594502446966948, "learning_rate": 3.1199314855489547e-08, "loss": 0.0386, "step": 3386 }, { "epoch": 0.9657827202737382, "grad_norm": 0.6155279599436443, "learning_rate": 3.0686280615783156e-08, "loss": 0.007, "step": 3387 }, { "epoch": 0.9660678642714571, "grad_norm": 0.9940196228788715, "learning_rate": 3.0177486542141144e-08, "loss": 0.0181, "step": 3388 }, { "epoch": 0.9663530082691759, "grad_norm": 0.9662957584867693, "learning_rate": 2.9672933068701227e-08, "loss": 0.0191, "step": 3389 }, { "epoch": 0.9666381522668948, "grad_norm": 1.3090922190101484, "learning_rate": 2.9172620625984583e-08, "loss": 0.0244, "step": 3390 }, { "epoch": 0.9669232962646136, "grad_norm": 0.8123264288075666, "learning_rate": 2.8676549640892502e-08, "loss": 0.015, "step": 3391 }, { "epoch": 0.9672084402623324, "grad_norm": 0.8232507163935322, "learning_rate": 2.8184720536706956e-08, "loss": 0.0161, "step": 3392 }, { "epoch": 0.9674935842600513, "grad_norm": 0.6151000895965828, "learning_rate": 2.7697133733091686e-08, "loss": 0.01, "step": 3393 }, { "epoch": 0.9677787282577702, "grad_norm": 1.5924017390252279, "learning_rate": 2.7213789646088896e-08, "loss": 0.055, "step": 3394 }, { "epoch": 0.9680638722554891, "grad_norm": 1.4169645143655845, "learning_rate": 2.673468868812312e-08, "loss": 0.0179, "step": 3395 }, { "epoch": 0.9683490162532079, "grad_norm": 0.5948053783440101, "learning_rate": 2.625983126799514e-08, "loss": 0.0083, "step": 3396 }, { "epoch": 0.9686341602509267, "grad_norm": 1.575445770817084, "learning_rate": 2.57892177908875e-08, "loss": 0.0313, "step": 3397 }, { "epoch": 0.9689193042486456, "grad_norm": 1.500329123622438, "learning_rate": 2.53228486583601e-08, "loss": 0.0241, "step": 3398 }, { "epoch": 0.9692044482463644, "grad_norm": 1.311203579000811, "learning_rate": 2.4860724268351845e-08, "loss": 0.0378, "step": 3399 }, { "epoch": 0.9694895922440833, "grad_norm": 1.7134696799995803, "learning_rate": 2.4402845015180088e-08, "loss": 0.0345, "step": 3400 }, { "epoch": 0.9697747362418021, "grad_norm": 0.6710349220659272, "learning_rate": 2.3949211289538975e-08, "loss": 0.0073, "step": 3401 }, { "epoch": 0.9700598802395209, "grad_norm": 1.5504891835178136, "learning_rate": 2.3499823478499995e-08, "loss": 0.0237, "step": 3402 }, { "epoch": 0.9703450242372398, "grad_norm": 0.7081799782835921, "learning_rate": 2.305468196551308e-08, "loss": 0.0148, "step": 3403 }, { "epoch": 0.9706301682349586, "grad_norm": 0.9281303546894957, "learning_rate": 2.2613787130403854e-08, "loss": 0.0192, "step": 3404 }, { "epoch": 0.9709153122326775, "grad_norm": 1.27109390632808, "learning_rate": 2.217713934937471e-08, "loss": 0.0453, "step": 3405 }, { "epoch": 0.9712004562303963, "grad_norm": 0.6464573130091568, "learning_rate": 2.1744738995003733e-08, "loss": 0.0253, "step": 3406 }, { "epoch": 0.9714856002281153, "grad_norm": 1.119205251723142, "learning_rate": 2.131658643624579e-08, "loss": 0.0356, "step": 3407 }, { "epoch": 0.9717707442258341, "grad_norm": 1.4078034602604481, "learning_rate": 2.0892682038429758e-08, "loss": 0.0391, "step": 3408 }, { "epoch": 0.9720558882235529, "grad_norm": 0.6417882749586009, "learning_rate": 2.0473026163261302e-08, "loss": 0.0155, "step": 3409 }, { "epoch": 0.9723410322212718, "grad_norm": 0.5777849338634182, "learning_rate": 2.0057619168819544e-08, "loss": 0.0132, "step": 3410 }, { "epoch": 0.9726261762189906, "grad_norm": 0.7877201908002133, "learning_rate": 1.964646140955928e-08, "loss": 0.0282, "step": 3411 }, { "epoch": 0.9729113202167095, "grad_norm": 1.1195581875615601, "learning_rate": 1.923955323630877e-08, "loss": 0.0124, "step": 3412 }, { "epoch": 0.9731964642144283, "grad_norm": 1.1199818626019644, "learning_rate": 1.883689499627084e-08, "loss": 0.0156, "step": 3413 }, { "epoch": 0.9734816082121471, "grad_norm": 0.600014710134955, "learning_rate": 1.84384870330212e-08, "loss": 0.0217, "step": 3414 }, { "epoch": 0.973766752209866, "grad_norm": 0.980931450691564, "learning_rate": 1.8044329686509598e-08, "loss": 0.0199, "step": 3415 }, { "epoch": 0.9740518962075848, "grad_norm": 0.820500426709812, "learning_rate": 1.7654423293058666e-08, "loss": 0.0099, "step": 3416 }, { "epoch": 0.9743370402053037, "grad_norm": 0.7809793057569531, "learning_rate": 1.726876818536394e-08, "loss": 0.0146, "step": 3417 }, { "epoch": 0.9746221842030225, "grad_norm": 1.0867941197474569, "learning_rate": 1.6887364692493303e-08, "loss": 0.0171, "step": 3418 }, { "epoch": 0.9749073282007413, "grad_norm": 1.128048720541707, "learning_rate": 1.6510213139886987e-08, "loss": 0.0558, "step": 3419 }, { "epoch": 0.9751924721984602, "grad_norm": 1.548358358713432, "learning_rate": 1.613731384935702e-08, "loss": 0.0403, "step": 3420 }, { "epoch": 0.9754776161961791, "grad_norm": 1.50416589986925, "learning_rate": 1.5768667139086645e-08, "loss": 0.0432, "step": 3421 }, { "epoch": 0.975762760193898, "grad_norm": 1.3525365106907985, "learning_rate": 1.540427332363148e-08, "loss": 0.0193, "step": 3422 }, { "epoch": 0.9760479041916168, "grad_norm": 2.136152304647279, "learning_rate": 1.5044132713917803e-08, "loss": 0.0434, "step": 3423 }, { "epoch": 0.9763330481893356, "grad_norm": 0.703158736191946, "learning_rate": 1.4688245617243135e-08, "loss": 0.0141, "step": 3424 }, { "epoch": 0.9766181921870545, "grad_norm": 1.7291368031174819, "learning_rate": 1.4336612337274014e-08, "loss": 0.0177, "step": 3425 }, { "epoch": 0.9769033361847733, "grad_norm": 0.7954994827635179, "learning_rate": 1.3989233174050431e-08, "loss": 0.0233, "step": 3426 }, { "epoch": 0.9771884801824922, "grad_norm": 0.5057022729587526, "learning_rate": 1.3646108423978621e-08, "loss": 0.0083, "step": 3427 }, { "epoch": 0.977473624180211, "grad_norm": 1.395543096080145, "learning_rate": 1.3307238379838273e-08, "loss": 0.0238, "step": 3428 }, { "epoch": 0.9777587681779298, "grad_norm": 0.5812031981674576, "learning_rate": 1.2972623330775869e-08, "loss": 0.0112, "step": 3429 }, { "epoch": 0.9780439121756487, "grad_norm": 1.9594929069324818, "learning_rate": 1.2642263562309131e-08, "loss": 0.0526, "step": 3430 }, { "epoch": 0.9783290561733675, "grad_norm": 0.6563286778890409, "learning_rate": 1.2316159356323132e-08, "loss": 0.0078, "step": 3431 }, { "epoch": 0.9786142001710864, "grad_norm": 0.5324232039060124, "learning_rate": 1.1994310991074177e-08, "loss": 0.0108, "step": 3432 }, { "epoch": 0.9788993441688052, "grad_norm": 1.2259761094193657, "learning_rate": 1.1676718741184812e-08, "loss": 0.033, "step": 3433 }, { "epoch": 0.9791844881665241, "grad_norm": 1.365541853973848, "learning_rate": 1.1363382877647155e-08, "loss": 0.0183, "step": 3434 }, { "epoch": 0.979469632164243, "grad_norm": 1.8621831220471732, "learning_rate": 1.1054303667821232e-08, "loss": 0.0299, "step": 3435 }, { "epoch": 0.9797547761619618, "grad_norm": 0.6347679936689795, "learning_rate": 1.0749481375434966e-08, "loss": 0.0099, "step": 3436 }, { "epoch": 0.9800399201596807, "grad_norm": 1.1434239902853023, "learning_rate": 1.0448916260584752e-08, "loss": 0.0215, "step": 3437 }, { "epoch": 0.9803250641573995, "grad_norm": 0.7640056037278686, "learning_rate": 1.0152608579733214e-08, "loss": 0.0187, "step": 3438 }, { "epoch": 0.9806102081551183, "grad_norm": 0.9356956746133379, "learning_rate": 9.860558585710334e-09, "loss": 0.0122, "step": 3439 }, { "epoch": 0.9808953521528372, "grad_norm": 1.1473267478745108, "learning_rate": 9.57276652771455e-09, "loss": 0.0166, "step": 3440 }, { "epoch": 0.981180496150556, "grad_norm": 1.9021257652076382, "learning_rate": 9.289232651309432e-09, "loss": 0.0264, "step": 3441 }, { "epoch": 0.9814656401482749, "grad_norm": 0.630448042704614, "learning_rate": 9.009957198426455e-09, "loss": 0.0092, "step": 3442 }, { "epoch": 0.9817507841459937, "grad_norm": 1.1654682091314135, "learning_rate": 8.73494040736278e-09, "loss": 0.0467, "step": 3443 }, { "epoch": 0.9820359281437125, "grad_norm": 0.8094012015584077, "learning_rate": 8.46418251278236e-09, "loss": 0.0122, "step": 3444 }, { "epoch": 0.9823210721414314, "grad_norm": 0.43072831856408855, "learning_rate": 8.197683745713725e-09, "loss": 0.0099, "step": 3445 }, { "epoch": 0.9826062161391502, "grad_norm": 0.9313780942022813, "learning_rate": 7.935444333552756e-09, "loss": 0.0375, "step": 3446 }, { "epoch": 0.9828913601368692, "grad_norm": 1.86814045946998, "learning_rate": 7.677464500061015e-09, "loss": 0.0354, "step": 3447 }, { "epoch": 0.983176504134588, "grad_norm": 0.9242833759734096, "learning_rate": 7.423744465364091e-09, "loss": 0.017, "step": 3448 }, { "epoch": 0.9834616481323069, "grad_norm": 1.004654257677267, "learning_rate": 7.1742844459543605e-09, "loss": 0.0225, "step": 3449 }, { "epoch": 0.9837467921300257, "grad_norm": 0.5132997575591574, "learning_rate": 6.929084654688223e-09, "loss": 0.0076, "step": 3450 }, { "epoch": 0.9840319361277445, "grad_norm": 1.2537143990748647, "learning_rate": 6.688145300787208e-09, "loss": 0.0174, "step": 3451 }, { "epoch": 0.9843170801254634, "grad_norm": 0.856916087539409, "learning_rate": 6.451466589837974e-09, "loss": 0.0076, "step": 3452 }, { "epoch": 0.9846022241231822, "grad_norm": 1.2874699118028397, "learning_rate": 6.219048723790644e-09, "loss": 0.0161, "step": 3453 }, { "epoch": 0.984887368120901, "grad_norm": 0.7191572461319289, "learning_rate": 5.990891900961582e-09, "loss": 0.0143, "step": 3454 }, { "epoch": 0.9851725121186199, "grad_norm": 1.5720947624205361, "learning_rate": 5.766996316029505e-09, "loss": 0.0312, "step": 3455 }, { "epoch": 0.9854576561163387, "grad_norm": 1.5965820870491516, "learning_rate": 5.547362160037151e-09, "loss": 0.0483, "step": 3456 }, { "epoch": 0.9857428001140576, "grad_norm": 2.324661905639617, "learning_rate": 5.331989620392386e-09, "loss": 0.0569, "step": 3457 }, { "epoch": 0.9860279441117764, "grad_norm": 1.1089308013214734, "learning_rate": 5.120878880866542e-09, "loss": 0.0142, "step": 3458 }, { "epoch": 0.9863130881094953, "grad_norm": 0.8282424350000585, "learning_rate": 4.914030121593305e-09, "loss": 0.025, "step": 3459 }, { "epoch": 0.9865982321072141, "grad_norm": 1.2887917701143625, "learning_rate": 4.711443519070935e-09, "loss": 0.0357, "step": 3460 }, { "epoch": 0.986883376104933, "grad_norm": 0.948215811306342, "learning_rate": 4.513119246160602e-09, "loss": 0.0131, "step": 3461 }, { "epoch": 0.9871685201026519, "grad_norm": 1.220943268524718, "learning_rate": 4.3190574720858305e-09, "loss": 0.0326, "step": 3462 }, { "epoch": 0.9874536641003707, "grad_norm": 0.41899450985279857, "learning_rate": 4.129258362434163e-09, "loss": 0.0054, "step": 3463 }, { "epoch": 0.9877388080980896, "grad_norm": 0.647778329863355, "learning_rate": 3.943722079155499e-09, "loss": 0.0151, "step": 3464 }, { "epoch": 0.9880239520958084, "grad_norm": 1.802544256763322, "learning_rate": 3.762448780562089e-09, "loss": 0.034, "step": 3465 }, { "epoch": 0.9883090960935272, "grad_norm": 0.9733016714220596, "learning_rate": 3.585438621329096e-09, "loss": 0.0091, "step": 3466 }, { "epoch": 0.9885942400912461, "grad_norm": 0.7507618500967945, "learning_rate": 3.41269175249459e-09, "loss": 0.0134, "step": 3467 }, { "epoch": 0.9888793840889649, "grad_norm": 0.6719411044195046, "learning_rate": 3.2442083214573316e-09, "loss": 0.0074, "step": 3468 }, { "epoch": 0.9891645280866838, "grad_norm": 2.0659767490048186, "learning_rate": 3.0799884719795448e-09, "loss": 0.0223, "step": 3469 }, { "epoch": 0.9894496720844026, "grad_norm": 1.0486040613170182, "learning_rate": 2.920032344185253e-09, "loss": 0.0154, "step": 3470 }, { "epoch": 0.9897348160821214, "grad_norm": 1.226563276780761, "learning_rate": 2.7643400745602787e-09, "loss": 0.0185, "step": 3471 }, { "epoch": 0.9900199600798403, "grad_norm": 1.1703998955142922, "learning_rate": 2.612911795951689e-09, "loss": 0.0124, "step": 3472 }, { "epoch": 0.9903051040775591, "grad_norm": 0.6924588334370636, "learning_rate": 2.465747637568905e-09, "loss": 0.0075, "step": 3473 }, { "epoch": 0.9905902480752781, "grad_norm": 0.9080422538502102, "learning_rate": 2.322847724982591e-09, "loss": 0.0115, "step": 3474 }, { "epoch": 0.9908753920729969, "grad_norm": 0.8362530707299618, "learning_rate": 2.1842121801257666e-09, "loss": 0.0132, "step": 3475 }, { "epoch": 0.9911605360707157, "grad_norm": 0.4226196218790714, "learning_rate": 2.0498411212904746e-09, "loss": 0.0058, "step": 3476 }, { "epoch": 0.9914456800684346, "grad_norm": 1.8399346633917293, "learning_rate": 1.9197346631327774e-09, "loss": 0.0283, "step": 3477 }, { "epoch": 0.9917308240661534, "grad_norm": 0.7267003973721158, "learning_rate": 1.7938929166683161e-09, "loss": 0.0095, "step": 3478 }, { "epoch": 0.9920159680638723, "grad_norm": 1.8275007401191372, "learning_rate": 1.6723159892734209e-09, "loss": 0.0189, "step": 3479 }, { "epoch": 0.9923011120615911, "grad_norm": 0.9793326780518042, "learning_rate": 1.5550039846867758e-09, "loss": 0.0273, "step": 3480 }, { "epoch": 0.99258625605931, "grad_norm": 1.3801801510684655, "learning_rate": 1.4419570030071995e-09, "loss": 0.0356, "step": 3481 }, { "epoch": 0.9928714000570288, "grad_norm": 1.5847937802859948, "learning_rate": 1.3331751406936433e-09, "loss": 0.0445, "step": 3482 }, { "epoch": 0.9931565440547476, "grad_norm": 0.8944423135053186, "learning_rate": 1.2286584905668587e-09, "loss": 0.0219, "step": 3483 }, { "epoch": 0.9934416880524665, "grad_norm": 1.4836875534178315, "learning_rate": 1.1284071418077303e-09, "loss": 0.0213, "step": 3484 }, { "epoch": 0.9937268320501853, "grad_norm": 1.3626285135130423, "learning_rate": 1.0324211799578321e-09, "loss": 0.0259, "step": 3485 }, { "epoch": 0.9940119760479041, "grad_norm": 0.47759862221040755, "learning_rate": 9.407006869188718e-10, "loss": 0.0055, "step": 3486 }, { "epoch": 0.9942971200456231, "grad_norm": 0.5968701456927105, "learning_rate": 8.532457409532457e-10, "loss": 0.0082, "step": 3487 }, { "epoch": 0.9945822640433419, "grad_norm": 0.5594488628362712, "learning_rate": 7.700564166834845e-10, "loss": 0.0097, "step": 3488 }, { "epoch": 0.9948674080410608, "grad_norm": 1.836502535790717, "learning_rate": 6.911327850928074e-10, "loss": 0.0312, "step": 3489 }, { "epoch": 0.9951525520387796, "grad_norm": 0.7196938824374762, "learning_rate": 6.164749135240122e-10, "loss": 0.0261, "step": 3490 }, { "epoch": 0.9954376960364985, "grad_norm": 0.9928051386279392, "learning_rate": 5.460828656811412e-10, "loss": 0.0251, "step": 3491 }, { "epoch": 0.9957228400342173, "grad_norm": 0.9905040540471107, "learning_rate": 4.799567016267048e-10, "loss": 0.0167, "step": 3492 }, { "epoch": 0.9960079840319361, "grad_norm": 1.4772976262817499, "learning_rate": 4.1809647778501274e-10, "loss": 0.0144, "step": 3493 }, { "epoch": 0.996293128029655, "grad_norm": 1.617884852382969, "learning_rate": 3.605022469388431e-10, "loss": 0.0367, "step": 3494 }, { "epoch": 0.9965782720273738, "grad_norm": 1.9532885950196877, "learning_rate": 3.07174058232218e-10, "loss": 0.0321, "step": 3495 }, { "epoch": 0.9968634160250927, "grad_norm": 1.21126863713605, "learning_rate": 2.5811195716762827e-10, "loss": 0.0241, "step": 3496 }, { "epoch": 0.9971485600228115, "grad_norm": 0.8578791491329285, "learning_rate": 2.133159856093636e-10, "loss": 0.0158, "step": 3497 }, { "epoch": 0.9974337040205303, "grad_norm": 0.8397533407417973, "learning_rate": 1.7278618177962726e-10, "loss": 0.0104, "step": 3498 }, { "epoch": 0.9977188480182492, "grad_norm": 0.8670159276111888, "learning_rate": 1.3652258026186638e-10, "loss": 0.0264, "step": 3499 }, { "epoch": 0.998003992015968, "grad_norm": 1.4392317009722384, "learning_rate": 1.045252119979967e-10, "loss": 0.0371, "step": 3500 }, { "epoch": 0.998289136013687, "grad_norm": 0.8578595317936397, "learning_rate": 7.679410429117795e-11, "loss": 0.0353, "step": 3501 }, { "epoch": 0.9985742800114058, "grad_norm": 0.43156184395377595, "learning_rate": 5.332928080359345e-11, "loss": 0.01, "step": 3502 }, { "epoch": 0.9988594240091246, "grad_norm": 1.35487670264924, "learning_rate": 3.413076155645012e-11, "loss": 0.0233, "step": 3503 }, { "epoch": 0.9991445680068435, "grad_norm": 0.8191645811893169, "learning_rate": 1.9198562931088682e-11, "loss": 0.0199, "step": 3504 }, { "epoch": 0.9994297120045623, "grad_norm": 1.885688610242415, "learning_rate": 8.532697669538791e-12, "loss": 0.0381, "step": 3505 }, { "epoch": 0.9997148560022812, "grad_norm": 1.563181683513951, "learning_rate": 2.1331748722985823e-12, "loss": 0.0318, "step": 3506 }, { "epoch": 1.0, "grad_norm": 0.8918846626454352, "learning_rate": 0.0, "loss": 0.0097, "step": 3507 }, { "epoch": 1.0, "step": 3507, "total_flos": 28156764236288.0, "train_loss": 0.031115454253810983, "train_runtime": 8471.7741, "train_samples_per_second": 3.312, "train_steps_per_second": 0.414 } ], "logging_steps": 1.0, "max_steps": 3507, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1365, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 28156764236288.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }